/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 * Copyright (c) 2020 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>

#include <util/symver.h>
#include <rdma/mlx5_user_ioctl_cmds.h>

#include "mlx5.h"
#include "mlx5-abi.h"
#include "wqe.h"
#include "mlx5_ifc.h"
#include "mlx5_vfio.h"

static void mlx5_free_context(struct ibv_context *ibctx);
static bool is_mlx5_dev(struct ibv_device *device);

/*
 * Fallbacks used only when CPU_OR / CPU_EQUAL are not already defined:
 * CPU_OR expands to an empty statement and CPU_EQUAL always evaluates to 1.
 */
#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

/*
 * Device match table: one driver-id entry followed by one PCI vendor/device
 * entry per listed adapter, terminated by an empty sentinel entry.
 * HCA(v, d) builds a PCI match entry for vendor PCI_VENDOR_ID_<v>, device d.
 */
#define HCA(v, d) VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, d, NULL)
const struct verbs_match_ent mlx5_hca_table[] = {
	VERBS_DRIVER_ID(RDMA_DRIVER_MLX5),
	HCA(MELLANOX, 0x1011),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 0x1012),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 0x1013),	/* ConnectX-4 */
	HCA(MELLANOX, 0x1014),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 0x1015),	/* ConnectX-4LX */
	HCA(MELLANOX, 0x1016),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 0x1017),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 0x1018),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 0x1019),    /* ConnectX-5 Ex */
	HCA(MELLANOX, 0x101a),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 0x101b),    /* ConnectX-6 */
	HCA(MELLANOX, 0x101c),	/* ConnectX-6 VF */
	HCA(MELLANOX, 0x101d),	/* ConnectX-6 DX */
	HCA(MELLANOX, 0x101e),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 0x101f),	/* ConnectX-6 LX */
	HCA(MELLANOX, 0x1021),  /* ConnectX-7 */
	HCA(MELLANOX, 0x1023),  /* ConnectX-8 */
	HCA(MELLANOX, 0x1025),  /* ConnectX-9 */
	HCA(MELLANOX, 0xa2d2),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 0xa2d3),	/* BlueField integrated ConnectX-5 network controller VF */
	HCA(MELLANOX, 0xa2d6),  /* BlueField-2 integrated ConnectX-6 Dx network controller */
	HCA(MELLANOX, 0xa2dc),  /* BlueField-3 integrated ConnectX-7 network controller */
	HCA(MELLANOX, 0xa2df),  /* BlueField-4 integrated ConnectX-8 network controller */
	{}
};

/* Debug mask; overwritten from MLX5_DEBUG_MASK by mlx5_set_debug_mask(). */
uint32_t mlx5_debug_mask = 0;
/* Overwritten from MLX5_FREEZE_ON_ERROR_CQE by set_freeze_on_error(). */
int mlx5_freeze_on_error_cqe;

/*
 * Verbs operation table binding each verbs_context_ops entry to its mlx5
 * implementation. The first group lists the core verbs; the second group
 * lists the remaining entries in alphabetical order, followed by later
 * additions.
 */
static const struct verbs_context_ops mlx5_ctx_common_ops = {
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.async_event   = mlx5_async_event,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr	       = mlx5_reg_mr,
	.reg_dmabuf_mr = mlx5_reg_dmabuf_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast,

	.advise_mr = mlx5_advise_mr,
	.alloc_dm = mlx5_alloc_dm,
	.alloc_parent_domain = mlx5_alloc_parent_domain,
	.alloc_td = mlx5_alloc_td,
	.attach_counters_point_flow = mlx5_attach_counters_point_flow,
	.close_xrcd = mlx5_close_xrcd,
	.create_counters = mlx5_create_counters,
	.create_cq_ex = mlx5_create_cq_ex,
	.create_flow = mlx5_create_flow,
	.create_flow_action_esp = mlx5_create_flow_action_esp,
	.create_qp_ex = mlx5_create_qp_ex,
	.create_rwq_ind_table = mlx5_create_rwq_ind_table,
	.create_srq_ex = mlx5_create_srq_ex,
	.create_wq = mlx5_create_wq,
	.dealloc_td = mlx5_dealloc_td,
	.destroy_counters = mlx5_destroy_counters,
	.destroy_flow = mlx5_destroy_flow,
	.destroy_flow_action = mlx5_destroy_flow_action,
	.destroy_rwq_ind_table = mlx5_destroy_rwq_ind_table,
	.destroy_wq = mlx5_destroy_wq,
	.free_dm = mlx5_free_dm,
	.get_srq_num = mlx5_get_srq_num,
	.import_dm = mlx5_import_dm,
	.import_mr = mlx5_import_mr,
	.import_pd = mlx5_import_pd,
	.modify_cq = mlx5_modify_cq,
	.modify_flow_action_esp = mlx5_modify_flow_action_esp,
	.modify_qp_rate_limit = mlx5_modify_qp_rate_limit,
	.modify_wq = mlx5_modify_wq,
	.open_qp = mlx5_open_qp,
	.open_xrcd = mlx5_open_xrcd,
	.post_srq_ops = mlx5_post_srq_ops,
	.query_device_ex = mlx5_query_device_ex,
	.query_ece = mlx5_query_ece,
	.query_rt_values = mlx5_query_rt_values,
	.read_counters = mlx5_read_counters,
	.reg_dm_mr = mlx5_reg_dm_mr,
	.alloc_null_mr = mlx5_alloc_null_mr,
	.free_context = mlx5_free_context,
	.set_ece = mlx5_set_ece,
	.unimport_dm = mlx5_unimport_dm,
	.unimport_mr = mlx5_unimport_mr,
	.unimport_pd = mlx5_unimport_pd,
	.query_qp_data_in_order = mlx5_query_qp_data_in_order,
	.alloc_dmah = mlx5_alloc_dmah,
	.dealloc_dmah = mlx5_dealloc_dmah,
	.reg_mr_ex = mlx5_reg_mr_ex,
};

/*
 * Operation table that sets only poll_cq, pointing it at mlx5_poll_cq_v1.
 * NOTE(review): presumably layered over mlx5_ctx_common_ops when CQE
 * version 1 is in use -- confirm at the call site.
 */
static const struct verbs_context_ops mlx5_ctx_cqev1_ops = {
	.poll_cq = mlx5_poll_cq_v1,
};

/*
 * Extract the integer that follows the first ':' in @line.
 *
 * On success the value, as converted by atoi(), is stored in @value and 0 is
 * returned. When @line contains no ':' the function returns 1 and @value is
 * left untouched.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *colon = strchr(line, ':');

	if (colon == NULL)
		return 1;

	/* The number starts right after the separator. */
	*value = atoi(colon + 1);
	return 0;
}
/**
 * Return the first free user-index found across the user-index tables, or -1
 * when no table has room.
 *
 * A table whose reference count is zero is not in use and has not been
 * allocated yet; its first index is returned and mlx5_store_uidx() is the one
 * that allocates the table and bumps its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tbl = 0;
	int32_t slot = 0;

	/* Skip tables whose reference count has reached the mask value. */
	while (tbl < MLX5_UIDX_TABLE_SIZE &&
	       ctx->uidx_table[tbl].refcnt >= MLX5_UIDX_TABLE_MASK)
		tbl++;

	if (tbl == MLX5_UIDX_TABLE_SIZE)
		return -1;

	/* Unallocated table: hand out its first index. */
	if (ctx->uidx_table[tbl].refcnt == 0)
		return tbl << MLX5_UIDX_TABLE_SHIFT;

	/* Otherwise scan for the first empty entry. */
	while (slot < MLX5_UIDX_TABLE_MASK + 1 &&
	       ctx->uidx_table[tbl].table[slot])
		slot++;

	return (tbl << MLX5_UIDX_TABLE_SHIFT) | slot;
}

/*
 * Translate a device command status code into an errno value.
 * Returns 0 for MLX5_CMD_STAT_OK; any status not listed maps to EIO.
 */
int mlx5_cmd_status_to_err(uint8_t status)
{
	switch (status) {
	case MLX5_CMD_STAT_OK:
		return 0;

	case MLX5_CMD_STAT_RES_BUSY:
		return EBUSY;

	case MLX5_CMD_STAT_LIM_ERR:
		return ENOMEM;

	case MLX5_CMD_STAT_NO_RES_ERR:
		return EAGAIN;

	case MLX5_CMD_STAT_BAD_OP_ERR:
	case MLX5_CMD_STAT_BAD_PARAM_ERR:
	case MLX5_CMD_STAT_BAD_RES_ERR:
	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:
	case MLX5_CMD_STAT_IX_ERR:
	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:
	case MLX5_CMD_STAT_BAD_PKT_ERR:
	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:
		return EINVAL;

	case MLX5_CMD_STAT_INT_ERR:
	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:
	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:
	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:
	default:
		return EIO;
	}
}

/*
 * Refine @err using the command output mailbox @out.
 * Only EREMOTEIO is translated, via the status field of @out; every other
 * value of @err is returned unchanged.
 */
int mlx5_get_cmd_status_err(int err, void *out)
{
	if (err != EREMOTEIO)
		return err;

	return mlx5_cmd_status_to_err(DEVX_GET(mbox_out, out, status));
}

/*
 * Store @rsc under a newly chosen user-index and return that index.
 * Returns -1 when no index is free or the table allocation fails.
 * The whole operation runs under ctx->uidx_table_mutex.
 */
int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t stored = -1;
	int32_t idx;
	int32_t tbl;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	idx = get_free_uidx(ctx);
	if (idx < 0)
		goto unlock;

	tbl = idx >> MLX5_UIDX_TABLE_SHIFT;

	/* A zero reference count means the table is not allocated yet. */
	if (ctx->uidx_table[tbl].refcnt == 0) {
		ctx->uidx_table[tbl].table =
			calloc(MLX5_UIDX_TABLE_MASK + 1,
			       sizeof(struct mlx5_resource *));
		if (ctx->uidx_table[tbl].table == NULL)
			goto unlock;
	}

	ctx->uidx_table[tbl].table[idx & MLX5_UIDX_TABLE_MASK] = rsc;
	ctx->uidx_table[tbl].refcnt++;
	stored = idx;

unlock:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return stored;
}

/*
 * Release user-index @uidx under ctx->uidx_table_mutex.
 * The owning table's reference count is decremented; when it reaches zero
 * the table is freed, otherwise only the entry is cleared.
 */
void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tbl = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	ctx->uidx_table[tbl].refcnt--;
	if (ctx->uidx_table[tbl].refcnt)
		ctx->uidx_table[tbl].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
	else
		free(ctx->uidx_table[tbl].table);

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

/*
 * Look up the entry stored for @mkey.
 * Returns NULL when the owning table has a zero reference count; otherwise
 * returns whatever the table holds at that slot. No lock is taken here.
 */
struct mlx5_mkey *mlx5_find_mkey(struct mlx5_context *ctx, uint32_t mkey)
{
	int tbl = mkey >> MLX5_MKEY_TABLE_SHIFT;

	if (!ctx->mkey_table[tbl].refcnt)
		return NULL;

	return ctx->mkey_table[tbl].table[mkey & MLX5_MKEY_TABLE_MASK];
}

/*
 * Record @mlx5_mkey under key @mkey, allocating the owning table on first use.
 * Returns 0 on success and -1 when the table allocation fails.
 * Runs under ctx->mkey_table_mutex.
 */
int mlx5_store_mkey(struct mlx5_context *ctx, uint32_t mkey,
		    struct mlx5_mkey *mlx5_mkey)
{
	int tbl = mkey >> MLX5_MKEY_TABLE_SHIFT;
	int rc = -1;

	pthread_mutex_lock(&ctx->mkey_table_mutex);

	/* A zero reference count means the table is not allocated yet. */
	if (ctx->mkey_table[tbl].refcnt == 0) {
		ctx->mkey_table[tbl].table =
			calloc(MLX5_MKEY_TABLE_MASK + 1,
			       sizeof(struct mlx5_mkey *));
		if (ctx->mkey_table[tbl].table == NULL)
			goto unlock;
	}

	ctx->mkey_table[tbl].table[mkey & MLX5_MKEY_TABLE_MASK] = mlx5_mkey;
	ctx->mkey_table[tbl].refcnt++;
	rc = 0;

unlock:
	pthread_mutex_unlock(&ctx->mkey_table_mutex);
	return rc;
}

/*
 * Drop the entry for @mkey under ctx->mkey_table_mutex.
 * The owning table's reference count is decremented; when it reaches zero
 * the table is freed, otherwise only the entry is cleared.
 */
void mlx5_clear_mkey(struct mlx5_context *ctx, uint32_t mkey)
{
	int tbl = mkey >> MLX5_MKEY_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->mkey_table_mutex);

	ctx->mkey_table[tbl].refcnt--;
	if (ctx->mkey_table[tbl].refcnt)
		ctx->mkey_table[tbl].table[mkey & MLX5_MKEY_TABLE_MASK] = NULL;
	else
		free(ctx->mkey_table[tbl].table);

	pthread_mutex_unlock(&ctx->mkey_table_mutex);
}

/*
 * Create a single PSV object on the protection domain @pd through a DEVX
 * CREATE_PSV command.
 *
 * Returns the new PSV with its index filled in from the command output, or
 * NULL with errno set (ENOMEM on allocation failure, otherwise the errno
 * refined by mlx5_get_cmd_status_err()).
 */
struct mlx5_psv *mlx5_create_psv(struct ibv_pd *pd)
{
	uint32_t out[DEVX_ST_SZ_DW(create_psv_out)] = {};
	uint32_t in[DEVX_ST_SZ_DW(create_psv_in)] = {};
	struct mlx5_psv *psv = calloc(1, sizeof(*psv));

	if (psv == NULL) {
		errno = ENOMEM;
		return NULL;
	}

	DEVX_SET(create_psv_in, in, opcode, MLX5_CMD_OP_CREATE_PSV);
	DEVX_SET(create_psv_in, in, pd, to_mpd(pd)->pdn);
	DEVX_SET(create_psv_in, in, num_psv, 1);

	psv->devx_obj = mlx5dv_devx_obj_create(pd->context, in, sizeof(in),
					       out, sizeof(out));
	if (psv->devx_obj) {
		psv->index = DEVX_GET(create_psv_out, out, psv0_index);
		return psv;
	}

	/* Creation failed: refine errno from the mailbox, then clean up. */
	errno = mlx5_get_cmd_status_err(errno, out);
	free(psv);
	return NULL;
}

/*
 * Destroy the DEVX object behind @psv and, only if that succeeds, free @psv.
 * Returns the result of mlx5dv_devx_obj_destroy(); on failure @psv is kept.
 */
int mlx5_destroy_psv(struct mlx5_psv *psv)
{
	int rc = mlx5dv_devx_obj_destroy(psv->devx_obj);

	if (rc)
		return rc;

	free(psv);
	return 0;
}

/*
 * Scan /proc/cpuinfo to decide whether any listed processor is a Sandy Bridge
 * CPU (family 6, model 0x2A or 0x2D).
 *
 * @num_cores: set to the number of "processor" entries seen; it is 0 when
 *             /proc/cpuinfo cannot be opened.
 *
 * Returns 1 if a matching processor was found, 0 otherwise (including when
 * the file cannot be opened).
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	/* Give the output a defined value even on the early-return path. */
	*num_cores = 0;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	while (fgets(line, sizeof(line), fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model  = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			/*
			 * The 5-character prefix also matches "model name"
			 * lines; only the first match per processor is kept.
			 */
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
  within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
  the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
  bit 4, and the "7" is for bits 2, 1, and 0.
*/
/*
 * Fill @cpu_set with the CPUs described by a comma-separated hexadecimal mask.
 *
 * The mask text is taken from the MLX5_LOCAL_CPUS environment variable when
 * it is set; otherwise it is read from
 * /sys/class/infiniband/<device name>/device/local_cpus.
 * If that file cannot be opened or read, a warning is logged through
 * mctx->dbg_fp and @cpu_set is left as the caller passed it.
 *
 * Bits are only ever added to @cpu_set; the caller is expected to have
 * initialised it.
 */
static void mlx5_local_cpu_set(struct ibv_device *ibdev, struct mlx5_context *mctx,
			       cpu_set_t *cpu_set)
{
	char *p, buf[1024] = {};
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value)
		/* buf is zero-initialised, so copying size-1 bytes keeps it terminated */
		strncpy(buf, env_value, sizeof(buf) - 1);
	else {
		char fname[MAXPATHLEN];
		FILE *fp;

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus",
			 ibv_get_device_name(ibdev));

		fp = fopen(fname, "r");
		if (!fp) {
			mlx5_err(mctx->dbg_fp, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
		if (!fgets(buf, sizeof(buf), fp)) {
			mlx5_err(mctx->dbg_fp, PFX "Warning: can not get local cpu set: failed to read cpu mask\n");
			fclose(fp);
			return;
		}
		fclose(fp);
	}

	/* Start from the last word in the text; it holds bits 0-31. */
	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	/* i is the bit offset of the word currently being parsed. */
	i = 0;
	do {
		/* Cut the string at the separator so the next strrchr() finds the previous word. */
		if (*p == ',') {
			*p = 0;
			p ++;
		}

		word = strtoul(p, NULL, 16);

		/* Set one CPU per set bit of the word, offset by i. */
		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k+i, cpu_set);

		/* p still equals buf only when no separator preceded this word: it was the first one. */
		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

/*
 * Decide whether CQ-poll stalling should be enabled for this process.
 *
 * Returns 0 when the CPU is not Sandy Bridge, or when the process affinity
 * set lies inside the device-local CPU set. Returns 1 otherwise, including
 * when the process affinity cannot be read (a warning is logged through
 * mctx->dbg_fp in that case).
 */
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev, struct mlx5_context *mctx)
{
	cpu_set_t proc_cpus, device_cpus, merged;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&proc_cpus);
	CPU_ZERO(&device_cpus);
	CPU_ZERO(&merged);

	if (sched_getaffinity(0, sizeof(proc_cpus), &proc_cpus) == -1) {
		if (errno == EINVAL)
			mlx5_err(mctx->dbg_fp, PFX "Warning: my cpu set is too small\n");
		else
			mlx5_err(mctx->dbg_fp, PFX "Warning: failed to get my cpu set\n");
		/* On Sandy Bridge stalling stays enabled by default. */
		return 1;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, mctx, &device_cpus);

	/*
	 * The process set is contained in the device set exactly when their
	 * union equals the device set; no stalling is needed in that case.
	 */
	CPU_OR(&merged, &proc_cpus, &device_cpus);
	if (CPU_EQUAL(&merged, &device_cpus))
		return 0;

	return 1;
}

/*
 * Load the CQ stall tunables from the environment into @ctx and the
 * mlx5_stall_* globals.
 *
 * MLX5_STALL_CQ_POLL forces stalling on (any value other than "0") or off
 * ("0"); when unset, the decision comes from mlx5_enable_sandy_bridge_fix().
 * A negative mlx5_stall_num_loop selects adaptive stalling, starting at
 * mlx5_stall_cq_poll_min cycles.
 */
static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	const char *val;

	val = getenv("MLX5_STALL_CQ_POLL");
	if (!val)
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev, ctx);
	else if (strcmp(val, "0") == 0)
		/* cq stall explicitly disabled by user */
		ctx->stall_enable = 0;
	else
		/* cq stall is enforced by user */
		ctx->stall_enable = 1;

	val = getenv("MLX5_STALL_NUM_LOOP");
	if (val)
		mlx5_stall_num_loop = atoi(val);

	val = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (val)
		mlx5_stall_cq_poll_min = atoi(val);

	val = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (val)
		mlx5_stall_cq_poll_max = atoi(val);

	val = getenv("MLX5_STALL_CQ_INC_STEP");
	if (val)
		mlx5_stall_cq_inc_step = atoi(val);

	val = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (val)
		mlx5_stall_cq_dec_step = atoi(val);

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	} else {
		ctx->stall_adaptive_enable = 0;
		ctx->stall_cycles = 0;
	}
}

/*
 * Compute the total number of uuars to request.
 *
 * Starts from MLX5_DEF_TOT_UUARS, or from MLX5_TOTAL_UUARS when that
 * environment variable is set. The value is raised to at least the number of
 * uuars in one system page and then aligned to
 * MLX5_NUM_NON_FP_BFREGS_PER_UAR.
 *
 * Returns the count, -EINVAL when the starting value is below 1, or -ENOMEM
 * when the result exceeds MLX5_MAX_BFREGS.
 */
static int get_total_uuars(int page_size)
{
	const char *env = getenv("MLX5_TOTAL_UUARS");
	int total = MLX5_DEF_TOT_UUARS;
	int per_page;

	if (env != NULL)
		total = atoi(env);

	if (total < 1)
		return -EINVAL;

	per_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	total = max(per_page, total);
	total = align(total, MLX5_NUM_NON_FP_BFREGS_PER_UAR);

	return total > MLX5_MAX_BFREGS ? -ENOMEM : total;
}

/*
 * Select the stream used for driver debug/error output.
 *
 * @dbg_fp: receives the chosen stream.
 *
 * When MLX5_DEBUG_FILE is unset, *dbg_fp becomes the default stream: stderr
 * in MLX5_DEBUG builds, NULL otherwise. When it is set, the named file is
 * opened in append/update mode; if that fails, *dbg_fp falls back to the
 * default stream and the failure is reported through it.
 */
void mlx5_open_debug_file(FILE **dbg_fp)
{
	char *env;
	FILE *default_dbg_fp = NULL;

#ifdef MLX5_DEBUG
	default_dbg_fp = stderr;
#endif

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		*dbg_fp = default_dbg_fp;
		return;
	}

	/*
	 * "a+" is the standard append/update mode string; the previously used
	 * "aw+" is not one of the mode strings defined by the C standard.
	 */
	*dbg_fp = fopen(env, "a+");
	if (!*dbg_fp) {
		*dbg_fp = default_dbg_fp;
		mlx5_err(*dbg_fp, "Failed opening debug file %s\n", env);
	}
}

/*
 * Close the debug stream @dbg_fp.
 * A NULL stream and stderr are left alone.
 */
void mlx5_close_debug_file(FILE *dbg_fp)
{
	if (dbg_fp == NULL || dbg_fp == stderr)
		return;

	fclose(dbg_fp);
}

/*
 * Load mlx5_debug_mask from the MLX5_DEBUG_MASK environment variable.
 * The value is parsed by strtol() with base 0; the mask is left unchanged
 * when the variable is unset.
 */
void mlx5_set_debug_mask(void)
{
	const char *env = getenv("MLX5_DEBUG_MASK");

	if (env == NULL)
		return;

	mlx5_debug_mask = strtol(env, NULL, 0);
}

/*
 * Load mlx5_freeze_on_error_cqe from MLX5_FREEZE_ON_ERROR_CQE.
 * The value is parsed by strtol() with base 0; the flag is left unchanged
 * when the variable is unset.
 */
static void set_freeze_on_error(void)
{
	const char *env = getenv("MLX5_FREEZE_ON_ERROR_CQE");

	if (env == NULL)
		return;

	mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

/*
 * Read MLX5_POST_SEND_PREFER_BF.
 * Returns 1 when the variable is unset or holds anything other than "0",
 * and 0 when it is exactly "0".
 */
static int get_always_bf(void)
{
	const char *env = getenv("MLX5_POST_SEND_PREFER_BF");

	if (env == NULL)
		return 1;

	return strcmp(env, "0") != 0;
}

/*
 * Read MLX5_SHUT_UP_BF.
 * Returns 0 when the variable is unset or exactly "0", and 1 for any other
 * value.
 */
static int get_shut_up_bf(void)
{
	const char *env = getenv("MLX5_SHUT_UP_BF");

	if (env == NULL)
		return 0;

	return strcmp(env, "0") != 0;
}

/*
 * Number of low-latency uuars to use.
 *
 * Starts from 4, or from MLX5_NUM_LOW_LAT_UUARS when that variable is set,
 * and is raised to at least tot_uuars - MLX5_MED_BFREGS_TSHOLD.
 * Returns -EINVAL when the starting value is negative.
 */
static int get_num_low_lat_uuars(int tot_uuars)
{
	const char *env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	int low_lat = 4;

	if (env != NULL)
		low_lat = atoi(env);

	if (low_lat < 0)
		return -EINVAL;

	return max(low_lat, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
}

/* The library allocates an array of uuar contexts. The one at index zero does
 * not exercise the odd/even policy, so it can avoid a lock, but it may not use
 * blue flame. The upper ones, the low_lat_uuars, can use blue flame with no
 * lock since each is assigned to one QP only. The rest can use blue flame, but
 * since they are shared they need a lock.
 *
 * Returns 1 when uuar @uuarn must be protected by a lock, 0 otherwise.
 * Single-threaded mode never needs a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	int slot;

	if (mlx5_single_threaded || uuarn == 0)
		return 0;

	slot = (uuarn / 2) + (uuarn % 2);

	/* Slots at or above this boundary belong to the low-latency group. */
	return slot < ctx->tot_uuars - ctx->low_lat_uuars ? 1 : 0;
}

/*
 * Read MLX5_SINGLE_THREADED.
 * Returns 1 only when the variable is set to exactly "1"; 0 otherwise.
 */
static int single_threaded_app(void)
{
	const char *env = getenv("MLX5_SINGLE_THREADED");

	if (env == NULL)
		return 0;

	return strcmp(env, "1") == 0;
}

/*
 * Issue the get-context command, retrying with progressively shorter request
 * lengths until one attempt succeeds.
 *
 * Returns 0 as soon as an attempt succeeds, otherwise the result of the last
 * (shortest) attempt.
 */
static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct ibv_fd_arr *fds,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	struct verbs_context *verbs_ctx = &context->ibv_ctx;
	/* The ibv_cmd_get_context fails in older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility of new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing requested size based
	 * on the feature input size. To avoid this in the future, we
	 * will remove the check in kernel that requires fields unknown
	 * to the kernel to be cleared. This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * will be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp. If the
	 * response value can be interpreted as feature not supported
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware or because it decided
	 * to do so. If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	const size_t attempt_len[] = {
		req_len,
		offsetof(struct mlx5_alloc_ucontext, lib_caps),
		offsetof(struct mlx5_alloc_ucontext, max_cqe_version),
	};
	const size_t attempts = sizeof(attempt_len) / sizeof(attempt_len[0]);
	size_t n;
	int ret = 0;

	for (n = 0; n < attempts; n++) {
		ret = ibv_cmd_get_context(verbs_ctx, &req->ibv_cmd,
					  attempt_len[n], fds,
					  &resp->ibv_resp, resp_len);
		if (!ret)
			break;
	}

	return ret;
}

/*
 * Map the HCA core clock page read-only through the command fd and record
 * the clock address in the context.
 *
 * The stored pointer is the mapped page plus the in-page part of
 * context->core_clock.offset. Returns 0 on success; on mmap() failure a
 * warning is logged and -1 is returned.
 */
static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	off_t offset = 0;
	void *page;

	set_command(MLX5_IB_MMAP_CORE_CLOCK, &offset);
	page = mmap(NULL, mdev->page_size, PROT_READ, MAP_SHARED,
		    ibv_ctx->cmd_fd, mdev->page_size * offset);

	if (page != MAP_FAILED) {
		context->hca_core_clock = page +
			(context->core_clock.offset & (mdev->page_size - 1));
		return 0;
	}

	mlx5_err(context->dbg_fp, PFX
		 "Warning: Timestamp available,\n"
		 "but failed to mmap() hca core clock page.\n");
	return -1;
}

/*
 * Map the clock-info page (MLX5_IB_CLOCK_INFO_V1 index) read-only through
 * the command fd. On success the mapping is stored in
 * context->clock_info_page; on failure the context is left untouched and
 * nothing is reported.
 */
static void mlx5_map_clock_info(struct mlx5_device *mdev,
				struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	off_t offset = 0;
	void *page;

	set_command(MLX5_IB_MMAP_CLOCK_INFO, &offset);
	set_index(MLX5_IB_CLOCK_INFO_V1, &offset);

	page = mmap(NULL, mdev->page_size, PROT_READ, MAP_SHARED,
		    ibv_ctx->cmd_fd, offset * mdev->page_size);
	if (page == MAP_FAILED)
		return;

	context->clock_info_page = page;
}

/*
 * Query the current ODP capabilities of the device and translate the DC ODP
 * capability bits into IBV_ODP_SUPPORT_* flags.
 * Returns 0 when the query command fails.
 */
static uint32_t get_dc_odp_caps(struct ibv_context *ctx)
{
	uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {};
	uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {};
	uint16_t opmod = (MLX5_CAP_ODP << 1) | HCA_CAP_OPMOD_GET_CUR;
	uint32_t caps = 0;

	DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	DEVX_SET(query_hca_cap_in, in, op_mod, opmod);

	if (mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)))
		return 0;

	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.send) ?
		IBV_ODP_SUPPORT_SEND : 0;
	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.receive) ?
		IBV_ODP_SUPPORT_RECV : 0;
	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.write) ?
		IBV_ODP_SUPPORT_WRITE : 0;
	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.read) ?
		IBV_ODP_SUPPORT_READ : 0;
	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.atomic) ?
		IBV_ODP_SUPPORT_ATOMIC : 0;
	caps |= DEVX_GET(query_hca_cap_out, out,
			 capability.odp_cap.dc_odp_caps.srq_receive) ?
		IBV_ODP_SUPPORT_SRQ_RECV : 0;

	return caps;
}

/*
 * Fill @attrs_out with the mlx5-specific attributes of context @ctx_in.
 *
 * attrs_out->comp_mask is both input and output: on entry it selects which
 * optional fields the caller wants; on return it is overwritten with the
 * subset of those fields that were actually filled in. version and flags are
 * always reset and recomputed. Always returns 0.
 */
static int _mlx5dv_query_device(struct ibv_context *ctx_in,
				struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	/* Accumulates the bits reported back; written to comp_mask at the end. */
	uint64_t comp_mask_out = 0;

	attrs_out->version   = 0;
	attrs_out->flags     = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW_ALLOWED)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_CQE_128B_COMP)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD;

	if (mctx->flags & MLX5_CTX_FLAGS_REAL_TIME_TS_SUPPORTED)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_REAL_TIME_TS;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_ENHANCED_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW;

	if (mctx->vendor_cap_flags &
		MLX5_VENDOR_CAP_FLAGS_PACKET_BASED_CREDIT_MODE)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE;

	/* A non-zero blue flame register size is reported as blue flame support. */
	if (mctx->bf_reg_size > 0)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_BLUEFLAME;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_SWP) {
		attrs_out->sw_parsing_caps = mctx->sw_parsing_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_SWP;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		attrs_out->striding_rq_caps = mctx->striding_rq_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
		attrs_out->tunnel_offloads_caps = mctx->tunnel_offloads_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_DCI_STREAMS) {
		attrs_out->dci_streams_caps = mctx->dci_streams_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_DCI_STREAMS;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_DYN_BFREGS) {
		attrs_out->max_dynamic_bfregs = mctx->num_dyn_bfregs;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_DYN_BFREGS;
	}

	/* Reported only when the clock-info page is available in the context. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE) {
		if (mctx->clock_info_page) {
			attrs_out->max_clock_info_update_nsec =
					mctx->clock_info_page->overflow_period;
			comp_mask_out |= MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE;
		}
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS) {
		attrs_out->flow_action_flags = mctx->flow_action_flags;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS;
	}

	/* Queried from the device on every call rather than cached in mctx. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_DC_ODP_CAPS) {
		attrs_out->dc_odp_caps = get_dc_odp_caps(ctx_in);
		comp_mask_out |= MLX5DV_CONTEXT_MASK_DC_ODP_CAPS;
	}

	/* Reported only when the core clock pointer is set in the context. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK) {
		if (mctx->hca_core_clock) {
			attrs_out->hca_core_clock = mctx->hca_core_clock;
			comp_mask_out |= MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK;
		}
	}

	/* Reported only when the number of LAG ports is non-zero. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS) {
		if (mctx->entropy_caps.num_lag_ports) {
			attrs_out->num_lag_ports =
				mctx->entropy_caps.num_lag_ports;
			comp_mask_out |= MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS;
		}
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_SIGNATURE_OFFLOAD) {
		attrs_out->sig_caps = mctx->sig_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_SIGNATURE_OFFLOAD;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_WR_MEMCPY_LENGTH) {
		attrs_out->max_wr_memcpy_length =
			mctx->dma_mmo_caps.dma_max_size;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_WR_MEMCPY_LENGTH;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CRYPTO_OFFLOAD) {
		attrs_out->crypto_caps = mctx->crypto_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CRYPTO_OFFLOAD;
	}

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_MAX_DC_RD_ATOM) {
		attrs_out->max_dc_rd_atom = mctx->max_dc_rd_atom;
		attrs_out->max_dc_init_rd_atom = mctx->max_dc_init_rd_atom;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_MAX_DC_RD_ATOM;
	}

	/* Reported only when the reg_c0 mask is non-zero. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_REG_C0) {
		if (mctx->reg_c0.mask) {
			attrs_out->reg_c0 = mctx->reg_c0;
			comp_mask_out |= MLX5DV_CONTEXT_MASK_REG_C0;
		}
	}

	/* Reported only when the OOO_DP vendor capability flag is set. */
	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_OOO_RECV_WRS) {
		if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_OOO_DP) {
			attrs_out->ooo_recv_wrs_caps = mctx->ooo_recv_wrs_caps;
			comp_mask_out |= MLX5DV_CONTEXT_MASK_OOO_RECV_WRS;
		}
	}

	/* Replace the request mask with the mask of fields actually filled. */
	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

/*
 * Public entry point: forward to the query_device handler of the context's
 * DV operations.
 * Returns EOPNOTSUPP when the context has no DV operations or no
 * query_device handler; otherwise returns the handler's result.
 */
int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_dv_context_ops *dvops = mlx5_get_dv_ops(ctx_in);

	if (dvops && dvops->query_device)
		return dvops->query_device(ctx_in, attrs_out);

	return EOPNOTSUPP;
}

/*
 * Export the low-level layout of QP @qp_in into @qp_out.
 *
 * qp_out->comp_mask is read as the set of optional fields requested and is
 * overwritten with the set actually filled. Always returns 0.
 */
static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);
	uint64_t reported = 0;
	uintptr_t sq_base;

	/*
	 * A non-zero sq_buf_size means the send queue lives in its own
	 * buffer (IBV_QPT_RAW_PACKET); otherwise it sits inside the main
	 * buffer at sq.offset.
	 */
	if (mqp->sq_buf_size)
		sq_base = (uintptr_t)mqp->sq_buf.buf;
	else
		sq_base = (uintptr_t)mqp->buf.buf + mqp->sq.offset;

	qp_out->dbrec      = mqp->db;

	qp_out->sq.buf     = (void *)sq_base;
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg	   = mqp->bf->reg;
	/* The blue flame size is reported as 0 unless uuarn is positive. */
	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	if (qp_out->comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
		qp_out->uar_mmap_offset = mqp->bf->uar_mmap_offset;
		reported |= MLX5DV_QP_MASK_UAR_MMAP_OFFSET;
	}

	if (qp_out->comp_mask & MLX5DV_QP_MASK_RAW_QP_HANDLES) {
		qp_out->tirn = mqp->tirn;
		qp_out->tisn = mqp->tisn;
		qp_out->rqn = mqp->rqn;
		qp_out->sqn = mqp->sqn;
		reported |= MLX5DV_QP_MASK_RAW_QP_HANDLES;
	}

	if (qp_out->comp_mask & MLX5DV_QP_MASK_RAW_QP_TIR_ADDR) {
		qp_out->tir_icm_addr = mqp->tir_icm_addr;
		reported |= MLX5DV_QP_MASK_RAW_QP_TIR_ADDR;
	}

	qp_out->comp_mask = reported;

	return 0;
}

/*
 * Export the low-level layout of CQ @cq_in into @cq_out and flag the CQ as
 * DV-owned (MLX5_CQ_FLAGS_DV_OWNED). comp_mask is cleared. Always returns 0.
 */
static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_context *mctx = to_mctx(cq_in->context);
	struct mlx5_cq *mcq = to_mcq(cq_in);

	/* Mark the CQ before handing out its internals. */
	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	cq_out->cqn = mcq->cqn;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->cqe_cnt = mcq->verbs_cq.cq.cqe + 1;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->cq_uar = mctx->cq_uar_reg;
	cq_out->comp_mask = 0;

	return 0;
}

/*
 * Export the low-level layout of receive work queue @wq_in into @rwq_out.
 * comp_mask is cleared. Always returns 0.
 */
static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *rwq = to_mrwq(wq_in);

	rwq_out->buf = rwq->pbuff;
	rwq_out->dbrec = rwq->recv_db;
	rwq_out->stride = 1 << rwq->rq.wqe_shift;
	rwq_out->wqe_cnt = rwq->rq.wqe_cnt;
	rwq_out->comp_mask = 0;

	return 0;
}

/*
 * Export the low-level layout of SRQ @srq_in into @srq_out.
 *
 * srq_out->comp_mask is read as the set of optional fields requested and is
 * overwritten with the set actually filled. Always returns 0.
 */
static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
	uint64_t reported = 0;

	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;
	srq_out->stride = 1 << msrq->wqe_shift;

	if (srq_out->comp_mask & MLX5DV_SRQ_MASK_SRQN) {
		reported |= MLX5DV_SRQ_MASK_SRQN;
		srq_out->srqn = msrq->srqn;
	}

	srq_out->comp_mask = reported;
	return 0;
}

/*
 * Export the address and length of device memory @dm_in into @dm_out.
 *
 * dm_out->comp_mask is read as the set of optional fields requested and is
 * overwritten with the set actually filled. Always returns 0.
 */
static int mlx5dv_get_dm(struct ibv_dm *dm_in,
			 struct mlx5dv_dm *dm_out)
{
	struct mlx5_dm *mdm = to_mdm(dm_in);
	uint64_t reported = 0;

	dm_out->length = mdm->length;
	dm_out->buf = mdm->start_va;

	if (dm_out->comp_mask & MLX5DV_DM_MASK_REMOTE_VA) {
		reported |= MLX5DV_DM_MASK_REMOTE_VA;
		dm_out->remote_va = mdm->remote_va;
	}

	dm_out->comp_mask = reported;

	return 0;
}

/*
 * Export a pointer to the address vector embedded in AH @ah_in.
 * comp_mask is cleared. Always returns 0.
 */
static int mlx5dv_get_av(struct ibv_ah *ah_in,
			 struct mlx5dv_ah *ah_out)
{
	struct mlx5_ah *ah = to_mah(ah_in);

	ah_out->av = &ah->av;
	ah_out->comp_mask = 0;

	return 0;
}

/*
 * Export the protection domain number of @pd_in.
 * comp_mask is cleared. Always returns 0.
 */
static int mlx5dv_get_pd(struct ibv_pd *pd_in,
			 struct mlx5dv_pd *pd_out)
{
	struct mlx5_pd *pd = to_mpd(pd_in);

	pd_out->pdn = pd->pdn;
	pd_out->comp_mask = 0;

	return 0;
}

/*
 * Copy the handle of the DEVX object @devx_in into @devx_out.
 * Always returns 0.
 */
static int mlx5dv_get_devx(struct mlx5dv_devx_obj *devx_in,
			   struct mlx5dv_devx *devx_out)
{
	devx_out->handle = devx_in->handle;

	return 0;
}

/*
 * Issue a QUERY_LAG DEVX command on @ctx.
 *
 * @lag_state is always written on success; the two remap-affinity
 * outputs are optional and skipped when NULL.
 *
 * Returns 0 on success, otherwise the value produced by
 * mlx5_get_cmd_status_err() for the failed command.
 */
static int query_lag(struct ibv_context *ctx, uint8_t *lag_state,
		     uint8_t *tx_remap_affinity_1,
		     uint8_t *tx_remap_affinity_2)
{
	uint32_t cmd_out[DEVX_ST_SZ_DW(query_lag_out)] = {};
	uint32_t cmd_in[DEVX_ST_SZ_DW(query_lag_in)] = {};
	int err;

	DEVX_SET(query_lag_in, cmd_in, opcode, MLX5_CMD_OP_QUERY_LAG);

	err = mlx5dv_devx_general_cmd(ctx, cmd_in, sizeof(cmd_in),
				      cmd_out, sizeof(cmd_out));
	if (err)
		return mlx5_get_cmd_status_err(err, cmd_out);

	*lag_state = DEVX_GET(query_lag_out, cmd_out, ctx.lag_state);

	if (tx_remap_affinity_1)
		*tx_remap_affinity_1 = DEVX_GET(query_lag_out, cmd_out,
						ctx.tx_remap_affinity_1);

	if (tx_remap_affinity_2)
		*tx_remap_affinity_2 = DEVX_GET(query_lag_out, cmd_out,
						ctx.tx_remap_affinity_2);

	return 0;
}

/*
 * Tell whether LAG port operations may be attempted on @qp: the context
 * must report more than one LAG port and the QP must be of one of the
 * accepted types (a driver QP qualifies only when it is a DCI).
 */
static bool lag_operation_supported(struct ibv_qp *qp)
{
	struct mlx5_context *mctx = to_mctx(qp->context);
	struct mlx5_qp *mqp = to_mqp(qp);

	if (mctx->entropy_caps.num_lag_ports <= 1)
		return false;

	switch (qp->qp_type) {
	case IBV_QPT_RC:
	case IBV_QPT_UD:
	case IBV_QPT_UC:
	case IBV_QPT_RAW_PACKET:
	case IBV_QPT_XRC_SEND:
		return true;

	case IBV_QPT_DRIVER:
		return mqp->dc_type == MLX5DV_DCTYPE_DCI;

	default:
		return false;
	}
}


static int _mlx5dv_query_qp_lag_port(struct ibv_qp *qp, uint8_t *port_num,
				     uint8_t *active_port_num)
{
	uint8_t lag_state = 0, tx_remap_affinity_1 = 0, tx_remap_affinity_2 = 0;
	uint32_t in_tis[DEVX_ST_SZ_DW(query_tis_in)] = {};
	uint32_t out_tis[DEVX_ST_SZ_DW(query_tis_out)] = {};
	uint32_t in_qp[DEVX_ST_SZ_DW(query_qp_in)] = {};
	uint32_t out_qp[DEVX_ST_SZ_DW(query_qp_out)] = {};
	struct mlx5_context *mctx = to_mctx(qp->context);
	struct mlx5_qp *mqp = to_mqp(qp);
	int ret;

	if (!lag_operation_supported(qp))
		return EOPNOTSUPP;

	ret = query_lag(qp->context, &lag_state,
			&tx_remap_affinity_1, &tx_remap_affinity_2);
	if (ret)
		return ret;

	if (!lag_state && !mctx->entropy_caps.lag_tx_port_affinity)
		return EOPNOTSUPP;

	switch (qp->qp_type) {
	case IBV_QPT_RAW_PACKET:
		DEVX_SET(query_tis_in, in_tis, opcode, MLX5_CMD_OP_QUERY_TIS);
		DEVX_SET(query_tis_in, in_tis, tisn, mqp->tisn);
		ret = mlx5dv_devx_qp_query(qp, in_tis, sizeof(in_tis), out_tis,
					   sizeof(out_tis));
		if (ret)
			return mlx5_get_cmd_status_err(ret, out_tis);

		*port_num = DEVX_GET(query_tis_out, out_tis,
				     tis_context.lag_tx_port_affinity);
		break;

	default:
		DEVX_SET(query_qp_in, in_qp, opcode, MLX5_CMD_OP_QUERY_QP);
		DEVX_SET(query_qp_in, in_qp, qpn, qp->qp_num);
		ret = mlx5dv_devx_qp_query(qp, in_qp, sizeof(in_qp), out_qp,
					   sizeof(out_qp));
		if (ret)
			return mlx5_get_cmd_status_err(ret, out_qp);

		*port_num = DEVX_GET(query_qp_out, out_qp,
				     qpc.lag_tx_port_affinity);
		break;
	}

	switch (*port_num) {
	case 1:
		*active_port_num = tx_remap_affinity_1;
		break;

	case 2:
		*active_port_num = tx_remap_affinity_2;
		break;

	default:
		return EOPNOTSUPP;
	}

	return 0;
}

/*
 * Public entry point: dispatch to the query_qp_lag_port callback of the
 * DV ops bound to @qp's context, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_query_qp_lag_port(struct ibv_qp *qp, uint8_t *port_num,
			     uint8_t *active_port_num)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(qp->context);

	if (ops && ops->query_qp_lag_port)
		return ops->query_qp_lag_port(qp, port_num, active_port_num);

	return EOPNOTSUPP;
}

static int modify_tis_lag_port(struct ibv_qp *qp, uint8_t port_num)
{
	uint32_t out[DEVX_ST_SZ_DW(modify_tis_out)] = {};
	uint32_t in[DEVX_ST_SZ_DW(modify_tis_in)] = {};
	struct mlx5_qp *mqp = to_mqp(qp);
	int ret;

	DEVX_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS);
	DEVX_SET(modify_tis_in, in, tisn, mqp->tisn);
	DEVX_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1);
	DEVX_SET(modify_tis_in, in, ctx.lag_tx_port_affinity, port_num);
	ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, sizeof(out));
	return ret ? mlx5_get_cmd_status_err(ret, out) : 0;
}

/*
 * Set the LAG TX port affinity of @qp with an RTS2RTS_QP command.
 *
 * Returns EOPNOTSUPP when the context lacks the
 * rts2rts_lag_tx_port_affinity capability or the QP is not in RTS,
 * 0 on success, otherwise the value produced by
 * mlx5_get_cmd_status_err().
 */
static int modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num)
{
	uint32_t cmd_out[DEVX_ST_SZ_DW(rts2rts_qp_out)] = {};
	uint32_t cmd_in[DEVX_ST_SZ_DW(rts2rts_qp_in)] = {};
	struct mlx5_context *mctx = to_mctx(qp->context);
	int err;

	if (!mctx->entropy_caps.rts2rts_lag_tx_port_affinity)
		return EOPNOTSUPP;

	if (qp->state != IBV_QPS_RTS)
		return EOPNOTSUPP;

	DEVX_SET(rts2rts_qp_in, cmd_in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
	DEVX_SET(rts2rts_qp_in, cmd_in, qpn, qp->qp_num);
	DEVX_SET(rts2rts_qp_in, cmd_in, opt_param_mask,
		 MLX5_QPC_OPT_MASK_RTS2RTS_LAG_TX_PORT_AFFINITY);
	DEVX_SET(rts2rts_qp_in, cmd_in, qpc.lag_tx_port_affinity, port_num);

	err = mlx5dv_devx_qp_modify(qp, cmd_in, sizeof(cmd_in),
				    cmd_out, sizeof(cmd_out));
	if (err)
		return mlx5_get_cmd_status_err(err, cmd_out);

	return 0;
}

/*
 * Change the LAG TX port affinity of @qp to @port_num.
 *
 * Raw packet QPs are modified through their TIS; RC/UD/UC QPs and DCI
 * driver QPs through RTS2RTS. Any other QP type yields EOPNOTSUPP.
 */
static int _mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num)
{
	uint8_t configured_port, active_port;
	struct mlx5_qp *mqp = to_mqp(qp);
	int err;

	/* Query lag port to see if we are at all in lag mode, otherwise FW
	 * might return success and ignore the modification.
	 */
	err = mlx5dv_query_qp_lag_port(qp, &configured_port, &active_port);
	if (err)
		return err;

	if (qp->qp_type == IBV_QPT_RAW_PACKET)
		return modify_tis_lag_port(qp, port_num);

	if (qp->qp_type == IBV_QPT_DRIVER) {
		/* Among driver QPs only a DCI can be modified. */
		if (mqp->dc_type != MLX5DV_DCTYPE_DCI)
			return EOPNOTSUPP;
		return modify_qp_lag_port(qp, port_num);
	}

	if (qp->qp_type == IBV_QPT_RC ||
	    qp->qp_type == IBV_QPT_UD ||
	    qp->qp_type == IBV_QPT_UC)
		return modify_qp_lag_port(qp, port_num);

	return EOPNOTSUPP;
}

/*
 * Public entry point: dispatch to the modify_qp_lag_port callback of the
 * DV ops bound to @qp's context, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(qp->context);

	if (ops && ops->modify_qp_lag_port)
		return ops->modify_qp_lag_port(qp, port_num);

	return EOPNOTSUPP;
}

/*
 * Set the UDP source port of @qp's primary address path with an
 * RTS2RTS_QP command.
 *
 * Only RC and UC QPs in RTS are accepted, and only when the context
 * reports the rts2rts_qp_udp_sport capability; everything else yields
 * EOPNOTSUPP. Returns 0 on success, otherwise the value produced by
 * mlx5_get_cmd_status_err().
 */
static int _mlx5dv_modify_qp_udp_sport(struct ibv_qp *qp, uint16_t udp_sport)
{
	uint32_t cmd_in[DEVX_ST_SZ_DW(rts2rts_qp_in)] = {};
	uint32_t cmd_out[DEVX_ST_SZ_DW(rts2rts_qp_out)] = {};
	struct mlx5_context *mctx = to_mctx(qp->context);
	int err;

	if (qp->qp_type != IBV_QPT_RC && qp->qp_type != IBV_QPT_UC)
		return EOPNOTSUPP;

	if (qp->state != IBV_QPS_RTS ||
	    !mctx->entropy_caps.rts2rts_qp_udp_sport)
		return EOPNOTSUPP;

	DEVX_SET(rts2rts_qp_in, cmd_in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
	DEVX_SET(rts2rts_qp_in, cmd_in, qpn, qp->qp_num);
	DEVX_SET64(rts2rts_qp_in, cmd_in, opt_param_mask_95_32,
		   MLX5_QPC_OPT_MASK_32_UDP_SPORT);
	DEVX_SET(rts2rts_qp_in, cmd_in, qpc.primary_address_path.udp_sport,
		 udp_sport);

	err = mlx5dv_devx_qp_modify(qp, cmd_in, sizeof(cmd_in),
				    cmd_out, sizeof(cmd_out));
	if (err)
		return mlx5_get_cmd_status_err(err, cmd_out);

	return 0;
}

/*
 * Public entry point: dispatch to the modify_qp_udp_sport callback of
 * the DV ops bound to @qp's context, or return EOPNOTSUPP when none
 * exists.
 */
int mlx5dv_modify_qp_udp_sport(struct ibv_qp *qp, uint16_t udp_sport)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(qp->context);

	if (ops && ops->modify_qp_udp_sport)
		return ops->modify_qp_udp_sport(qp, udp_sport);

	return EOPNOTSUPP;
}

int mlx5dv_dci_stream_id_reset(struct ibv_qp *qp, uint16_t stream_id)
{
	uint32_t out[DEVX_ST_SZ_DW(rts2rts_qp_out)] = {};
	uint32_t in[DEVX_ST_SZ_DW(rts2rts_qp_in)] = {};
	struct mlx5_context *mctx = to_mctx(qp->context);
	struct mlx5_qp *mqp = to_mqp(qp);
	void *qpce = DEVX_ADDR_OF(rts2rts_qp_in, in, qpc_data_ext);
	int ret;

	if (!is_mlx5_dev(qp->context->device) ||
	    !mctx->dci_streams_caps.max_log_num_errored ||
	    !mctx->qpc_extension_cap ||
	    qp->state != IBV_QPS_RTS)
		return EOPNOTSUPP;

	if ((mqp->dc_type != MLX5DV_DCTYPE_DCI) || (qp->qp_type != IBV_QPT_DRIVER))
		return EINVAL;

	DEVX_SET(rts2rts_qp_in, in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
	DEVX_SET(rts2rts_qp_in, in, qpn, qp->qp_num);
	DEVX_SET(rts2rts_qp_in, in, qpc_ext, 1);
	DEVX_SET64(rts2rts_qp_in, in, opt_param_mask_95_32,
		   MLX5_QPC_OPT_MASK_32_DCI_STREAM_CHANNEL_ID);

	DEVX_SET(qpc_ext, qpce, dci_stream_channel_id, stream_id);

	ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, sizeof(out));
	return ret ? mlx5_get_cmd_status_err(ret, out) : 0;
}

/*
 * Tell whether @ctx reports every QoS capability bit that is checked
 * before a scheduling node is created.
 */
static bool sched_supported(struct ibv_context *ctx)
{
	struct mlx5_qos_caps *caps = &to_mctx(ctx)->qos_caps;

	if (!caps->qos)
		return false;

	if (!(caps->nic_element_type & ELEMENT_TYPE_CAP_MASK_TASR))
		return false;

	if (!(caps->nic_element_type & ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP))
		return false;

	if (!(caps->nic_tsar_type & TSAR_TYPE_CAP_MASK_DWRR))
		return false;

	return true;
}

/*
 * Create a NIC-hierarchy scheduling element of type @elem_type as a
 * DEVX general object, configured from @sched_attr.
 *
 * Returns the new DEVX object, or NULL with errno set from
 * mlx5_get_cmd_status_err() on failure.
 */
static struct mlx5dv_devx_obj *
mlx5dv_sched_nic_create(struct ibv_context *ctx,
			const struct mlx5dv_sched_attr *sched_attr,
			int elem_type)
{
	uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	uint32_t in[DEVX_ST_SZ_DW(create_sched_elem_in)] = {};
	struct mlx5dv_devx_obj *new_obj;
	uint32_t parent_id = 0;
	void *cmd_hdr, *elem, *elem_ctx, *tsar_attr;

	cmd_hdr = DEVX_ADDR_OF(create_sched_elem_in, in, hdr);
	elem = DEVX_ADDR_OF(create_sched_elem_in, in, sched_elem);
	elem_ctx = DEVX_ADDR_OF(create_sched_elem_in, in,
				sched_elem.sched_context);
	tsar_attr = DEVX_ADDR_OF(create_sched_elem_in, in,
				 sched_elem.sched_context.sched_elem_attr);

	/* General object command header. */
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, obj_type, MLX5_OBJ_TYPE_SCHEDULING_ELEMENT);

	/* Scheduling element. */
	DEVX_SET64(sched_elem, elem, modify_field_select, sched_attr->flags);
	DEVX_SET(sched_elem, elem,
		 scheduling_hierarchy, MLX5_SCHED_HIERARCHY_NIC);

	/* Scheduling context; a missing parent is encoded as id 0. */
	if (sched_attr->parent)
		parent_id = sched_attr->parent->obj->object_id;

	DEVX_SET(sched_context, elem_ctx, element_type, elem_type);
	DEVX_SET(sched_context, elem_ctx, parent_element_id, parent_id);
	if (sched_attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_BW_SHARE)
		DEVX_SET(sched_context, elem_ctx,
			 bw_share, sched_attr->bw_share);
	if (sched_attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_MAX_AVG_BW)
		DEVX_SET(sched_context, elem_ctx,
			 max_average_bw, sched_attr->max_avg_bw);

	DEVX_SET(sched_elem_attr_tsar, tsar_attr, tsar_type,
		 MLX5_SCHED_TSAR_TYPE_DWRR);

	new_obj = mlx5dv_devx_obj_create(ctx, in, sizeof(in),
					 out, sizeof(out));
	if (!new_obj)
		errno = mlx5_get_cmd_status_err(errno, out);

	return new_obj;
}

/*
 * Modify the NIC-hierarchy scheduling element @obj of type @elem_type
 * according to @sched_attr, using a MODIFY_GENERAL_OBJECT command.
 *
 * Returns 0 on success, otherwise the value produced by
 * mlx5_get_cmd_status_err().
 */
static int
mlx5dv_sched_nic_modify(struct mlx5dv_devx_obj *obj,
			const struct mlx5dv_sched_attr *sched_attr,
			int elem_type)
{
	uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	uint32_t in[DEVX_ST_SZ_DW(create_sched_elem_in)] = {};
	void *cmd_hdr, *elem, *elem_ctx, *tsar_attr;
	int err;

	cmd_hdr = DEVX_ADDR_OF(create_sched_elem_in, in, hdr);
	elem = DEVX_ADDR_OF(create_sched_elem_in, in, sched_elem);
	elem_ctx = DEVX_ADDR_OF(create_sched_elem_in, in,
				sched_elem.sched_context);
	tsar_attr = DEVX_ADDR_OF(create_sched_elem_in, in,
				 sched_elem.sched_context.sched_elem_attr);

	/* General object command header. */
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, obj_type, MLX5_OBJ_TYPE_SCHEDULING_ELEMENT);
	DEVX_SET(general_obj_in_cmd_hdr, in, obj_id, obj->object_id);

	/* Scheduling element. */
	DEVX_SET64(sched_elem, elem, modify_field_select, sched_attr->flags);
	DEVX_SET(sched_elem, elem,
		 scheduling_hierarchy, MLX5_SCHED_HIERARCHY_NIC);

	/* Scheduling context: only the flagged attributes are written. */
	DEVX_SET(sched_context, elem_ctx, element_type, elem_type);
	if (sched_attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_BW_SHARE)
		DEVX_SET(sched_context, elem_ctx,
			 bw_share, sched_attr->bw_share);
	if (sched_attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_MAX_AVG_BW)
		DEVX_SET(sched_context, elem_ctx,
			 max_average_bw, sched_attr->max_avg_bw);

	DEVX_SET(sched_elem_attr_tsar, tsar_attr, tsar_type,
		 MLX5_SCHED_TSAR_TYPE_DWRR);

	err = mlx5dv_devx_obj_modify(obj, in, sizeof(in), out, sizeof(out));
	if (err)
		return mlx5_get_cmd_status_err(err, out);

	return 0;
}

/* Union of the scheduling-element attribute flags accepted in this file. */
#define MLX5DV_SCHED_ELEM_ATTR_ALL_FLAGS		\
	(MLX5DV_SCHED_ELEM_ATTR_FLAGS_MAX_AVG_BW |	\
	 MLX5DV_SCHED_ELEM_ATTR_FLAGS_BW_SHARE)

/*
 * Tell whether every attribute flagged in @attr is backed by the
 * matching QoS capability of @ctx.
 */
static bool attr_supported(struct ibv_context *ctx,
			   const struct mlx5dv_sched_attr *attr)
{
	struct mlx5_qos_caps *caps = &to_mctx(ctx)->qos_caps;
	bool wants_bw_share =
		attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_BW_SHARE;
	bool wants_max_avg_bw =
		attr->flags & MLX5DV_SCHED_ELEM_ATTR_FLAGS_MAX_AVG_BW;

	if (wants_bw_share && !caps->nic_bw_share)
		return false;

	if (wants_max_avg_bw && !caps->nic_rate_limit)
		return false;

	return true;
}

/*
 * Validate the scheduling attributes supplied by the caller.
 *
 * @attr must be non-NULL, carry an empty comp_mask and only known
 * flags. A leaf (@node == false) must have a parent; a node may omit
 * the parent only when it sets no flags.
 */
static bool sched_attr_valid(const struct mlx5dv_sched_attr *attr, bool node)
{
	if (!attr)
		return false;

	if (attr->comp_mask)
		return false;

	if (!check_comp_mask(attr->flags, MLX5DV_SCHED_ELEM_ATTR_ALL_FLAGS))
		return false;

	/* With a parent, both nodes and leaves are acceptable. */
	if (attr->parent)
		return true;

	/* Without a parent only a flag-less node is acceptable. */
	return node && !attr->flags;
}

/*
 * Create a scheduling node (TSAR element) on @ctx.
 *
 * Returns the new node, or NULL with errno set: EINVAL for invalid
 * attributes, EOPNOTSUPP when scheduling or a requested attribute is
 * not supported, ENOMEM on allocation failure, or the errno left by
 * mlx5dv_sched_nic_create().
 */
static struct mlx5dv_sched_node *
_mlx5dv_sched_node_create(struct ibv_context *ctx,
			   const struct mlx5dv_sched_attr *attr)
{
	struct mlx5dv_sched_node *new_node;

	if (!sched_attr_valid(attr, true)) {
		errno = EINVAL;
		return NULL;
	}

	if (!sched_supported(ctx) || !attr_supported(ctx, attr)) {
		errno = EOPNOTSUPP;
		return NULL;
	}

	new_node = calloc(1, sizeof(*new_node));
	if (!new_node) {
		errno = ENOMEM;
		return NULL;
	}

	new_node->obj = mlx5dv_sched_nic_create(ctx, attr,
						MLX5_SCHED_ELEM_TYPE_TSAR);
	if (!new_node->obj) {
		free(new_node);
		return NULL;
	}

	new_node->parent = attr->parent;
	return new_node;
}

/*
 * Public entry point: dispatch to the sched_node_create callback of the
 * DV ops bound to @ctx, or fail with errno = EOPNOTSUPP when none exists.
 */
struct mlx5dv_sched_node *
mlx5dv_sched_node_create(struct ibv_context *ctx,
			 const struct mlx5dv_sched_attr *attr)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ctx);

	if (ops && ops->sched_node_create)
		return ops->sched_node_create(ctx, attr);

	errno = EOPNOTSUPP;
	return NULL;
}

/*
 * Create a scheduling leaf (queue-group element) on @ctx.
 *
 * Returns the new leaf, or NULL with errno set: EINVAL for invalid
 * attributes, EOPNOTSUPP when a requested attribute is not supported,
 * ENOMEM on allocation failure, or the errno left by
 * mlx5dv_sched_nic_create().
 */
static struct mlx5dv_sched_leaf *
_mlx5dv_sched_leaf_create(struct ibv_context *ctx,
			   const struct mlx5dv_sched_attr *attr)
{
	struct mlx5dv_sched_leaf *new_leaf;

	if (!sched_attr_valid(attr, false)) {
		errno = EINVAL;
		return NULL;
	}

	if (!attr_supported(ctx, attr)) {
		errno = EOPNOTSUPP;
		return NULL;
	}

	new_leaf = calloc(1, sizeof(*new_leaf));
	if (!new_leaf) {
		errno = ENOMEM;
		return NULL;
	}

	new_leaf->obj = mlx5dv_sched_nic_create(ctx, attr,
						MLX5_SCHED_ELEM_TYPE_QUEUE_GROUP);
	if (!new_leaf->obj) {
		free(new_leaf);
		return NULL;
	}

	new_leaf->parent = attr->parent;
	return new_leaf;
}

/*
 * Public entry point: dispatch to the sched_leaf_create callback of the
 * DV ops bound to @ctx, or fail with errno = EOPNOTSUPP when none exists.
 */
struct mlx5dv_sched_leaf *
mlx5dv_sched_leaf_create(struct ibv_context *ctx,
			 const struct mlx5dv_sched_attr *attr)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ctx);

	if (ops && ops->sched_leaf_create)
		return ops->sched_leaf_create(ctx, attr);

	errno = EOPNOTSUPP;
	return NULL;
}

/*
 * Modify the scheduling node @node according to @attr.
 *
 * Returns EINVAL (also stored in errno) for a NULL node or invalid
 * attributes, EOPNOTSUPP (also stored in errno) for an unsupported
 * attribute, otherwise the result of mlx5dv_sched_nic_modify().
 */
static int _mlx5dv_sched_node_modify(struct mlx5dv_sched_node *node,
				     const struct mlx5dv_sched_attr *attr)
{
	bool bad_args = !node || !sched_attr_valid(attr, true);

	if (bad_args) {
		errno = EINVAL;
		return EINVAL;
	}

	if (!attr_supported(node->obj->context, attr)) {
		errno = EOPNOTSUPP;
		return EOPNOTSUPP;
	}

	return mlx5dv_sched_nic_modify(node->obj, attr,
				       MLX5_SCHED_ELEM_TYPE_TSAR);
}

/*
 * Public entry point: modify the scheduling node @node according to
 * @attr through the DV ops bound to the node's context.
 *
 * Returns EINVAL (also stored in errno) when @node is NULL, EOPNOTSUPP
 * when no sched_node_modify callback is available, otherwise the
 * callback's result.
 */
int mlx5dv_sched_node_modify(struct mlx5dv_sched_node *node,
			     const struct mlx5dv_sched_attr *attr)
{
	struct mlx5_dv_context_ops *dvops;

	/* The context is reached through @node, so reject NULL here:
	 * the callback's own NULL check would run too late.
	 */
	if (!node) {
		errno = EINVAL;
		return errno;
	}

	dvops = mlx5_get_dv_ops(node->obj->context);
	if (!dvops || !dvops->sched_node_modify)
		return EOPNOTSUPP;

	return dvops->sched_node_modify(node, attr);
}

/*
 * Modify the scheduling leaf @leaf according to @attr.
 *
 * Returns EINVAL (also stored in errno) for a NULL leaf or invalid
 * attributes, EOPNOTSUPP (also stored in errno) for an unsupported
 * attribute, otherwise the result of mlx5dv_sched_nic_modify().
 */
static int _mlx5dv_sched_leaf_modify(struct mlx5dv_sched_leaf *leaf,
				     const struct mlx5dv_sched_attr *attr)
{
	bool bad_args = !leaf || !sched_attr_valid(attr, false);

	if (bad_args) {
		errno = EINVAL;
		return EINVAL;
	}

	if (!attr_supported(leaf->obj->context, attr)) {
		errno = EOPNOTSUPP;
		return EOPNOTSUPP;
	}

	return mlx5dv_sched_nic_modify(leaf->obj, attr,
				       MLX5_SCHED_ELEM_TYPE_QUEUE_GROUP);
}

/*
 * Public entry point: modify the scheduling leaf @leaf according to
 * @attr through the DV ops bound to the leaf's context.
 *
 * Returns EINVAL (also stored in errno) when @leaf is NULL, EOPNOTSUPP
 * when no sched_leaf_modify callback is available, otherwise the
 * callback's result.
 */
int mlx5dv_sched_leaf_modify(struct mlx5dv_sched_leaf *leaf,
			     const struct mlx5dv_sched_attr *attr)
{
	struct mlx5_dv_context_ops *dvops;

	/* The context is reached through @leaf, so reject NULL here:
	 * the callback's own NULL check would run too late.
	 */
	if (!leaf) {
		errno = EINVAL;
		return errno;
	}

	dvops = mlx5_get_dv_ops(leaf->obj->context);
	if (!dvops || !dvops->sched_leaf_modify)
		return EOPNOTSUPP;

	return dvops->sched_leaf_modify(leaf, attr);
}

/*
 * Destroy the DEVX object behind @node and release the node.
 * If the object cannot be destroyed the node is kept and the error
 * is returned.
 */
static int _mlx5dv_sched_node_destroy(struct mlx5dv_sched_node *node)
{
	int err = mlx5dv_devx_obj_destroy(node->obj);

	if (!err)
		free(node);

	return err;
}

/*
 * Public entry point: dispatch to the sched_node_destroy callback of the
 * DV ops bound to the node's context, or return EOPNOTSUPP when none
 * exists.
 */
int mlx5dv_sched_node_destroy(struct mlx5dv_sched_node *node)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(node->obj->context);

	if (ops && ops->sched_node_destroy)
		return ops->sched_node_destroy(node);

	return EOPNOTSUPP;
}

/*
 * Destroy the DEVX object behind @leaf and release the leaf.
 * If the object cannot be destroyed the leaf is kept and the error
 * is returned.
 */
static int _mlx5dv_sched_leaf_destroy(struct mlx5dv_sched_leaf *leaf)
{
	int err = mlx5dv_devx_obj_destroy(leaf->obj);

	if (!err)
		free(leaf);

	return err;
}

/*
 * Public entry point: dispatch to the sched_leaf_destroy callback of the
 * DV ops bound to the leaf's context, or return EOPNOTSUPP when none
 * exists.
 */
int mlx5dv_sched_leaf_destroy(struct mlx5dv_sched_leaf *leaf)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(leaf->obj->context);

	if (ops && ops->sched_leaf_destroy)
		return ops->sched_leaf_destroy(leaf);

	return EOPNOTSUPP;
}

static int modify_ib_qp_sched_elem_init(struct ibv_qp *qp,
					uint32_t req_id, uint32_t resp_id)
{
	uint64_t mask = MLX5_QPC_OPT_MASK_32_QOS_QUEUE_GROUP_ID;
	uint32_t in[DEVX_ST_SZ_DW(init2init_qp_in)] = {};
	uint32_t out[DEVX_ST_SZ_DW(init2init_qp_out)] = {};
	void *qpce = DEVX_ADDR_OF(init2init_qp_in, in, qpc_data_ext);
	int ret;

	DEVX_SET(init2init_qp_in, in, opcode, MLX5_CMD_OP_INIT2INIT_QP);
	DEVX_SET(init2init_qp_in, in, qpc_ext, 1);
	DEVX_SET(init2init_qp_in, in, qpn, qp->qp_num);
	DEVX_SET64(init2init_qp_in, in, opt_param_mask_95_32, mask);

	DEVX_SET(qpc_ext, qpce, qos_queue_group_id_requester, req_id);
	DEVX_SET(qpc_ext, qpce, qos_queue_group_id_responder, resp_id);

	ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, sizeof(out));
	return ret ? mlx5_get_cmd_status_err(ret, out) : 0;
}

static int modify_ib_qp_sched_elem_rts(struct ibv_qp *qp,
				       uint32_t req_id, uint32_t resp_id)
{
	uint64_t mask = MLX5_QPC_OPT_MASK_32_QOS_QUEUE_GROUP_ID;
	uint32_t in[DEVX_ST_SZ_DW(rts2rts_qp_in)] = {};
	uint32_t out[DEVX_ST_SZ_DW(rts2rts_qp_out)] = {};
	void *qpce = DEVX_ADDR_OF(rts2rts_qp_in, in, qpc_data_ext);
	int ret;

	DEVX_SET(rts2rts_qp_in, in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
	DEVX_SET(rts2rts_qp_in, in, qpc_ext, 1);
	DEVX_SET(rts2rts_qp_in, in, qpn, qp->qp_num);
	DEVX_SET64(rts2rts_qp_in, in, opt_param_mask_95_32, mask);

	DEVX_SET(qpc_ext, qpce, qos_queue_group_id_requester, req_id);
	DEVX_SET(qpc_ext, qpce, qos_queue_group_id_responder, resp_id);

	ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, sizeof(out));
	return ret ? mlx5_get_cmd_status_err(ret, out) : 0;
}

/*
 * Attach scheduling elements to an IB QP, picking the modify command
 * that matches the QP's current state (INIT or RTS). Any other state
 * yields EOPNOTSUPP.
 */
static int modify_ib_qp_sched_elem(struct ibv_qp *qp,
				   uint32_t req_id, uint32_t resp_id)
{
	if (qp->state == IBV_QPS_INIT)
		return modify_ib_qp_sched_elem_init(qp, req_id, resp_id);

	if (qp->state == IBV_QPS_RTS)
		return modify_ib_qp_sched_elem_rts(qp, req_id, resp_id);

	return EOPNOTSUPP;
}

static int modify_raw_qp_sched_elem(struct ibv_qp *qp, uint32_t qos_id)
{
	struct mlx5_qos_caps *qc = &to_mctx(qp->context)->qos_caps;
	uint32_t mout[DEVX_ST_SZ_DW(modify_sq_out)] = {};
	uint32_t min[DEVX_ST_SZ_DW(modify_sq_in)] = {};
	struct mlx5_qp *mqp = to_mqp(qp);
	void *sqc;
	int ret;

	if (qp->state != IBV_QPS_RTS || !qc->nic_sq_scheduling)
		return EOPNOTSUPP;

	DEVX_SET(modify_sq_in, min, opcode, MLX5_CMD_OP_MODIFY_SQ);
	DEVX_SET(modify_sq_in, min, sq_state, MLX5_SQC_STATE_RDY);
	DEVX_SET(modify_sq_in, min, sqn, mqp->sqn);
	DEVX_SET64(modify_sq_in, min, modify_bitmask,
		   MLX5_MODIFY_SQ_BITMASK_QOS_QUEUE_GROUP_ID);
	sqc = DEVX_ADDR_OF(modify_sq_in, min, sq_context);
	DEVX_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	DEVX_SET(sqc, sqc, qos_queue_group_id, qos_id);

	ret = mlx5dv_devx_qp_modify(qp, min, sizeof(min), mout, sizeof(mout));
	return ret ? mlx5_get_cmd_status_err(ret, mout) : 0;
}

/*
 * Attach scheduling leaves to @qp. A NULL leaf is passed down as id 0.
 *
 * Only RC QPs accept a @responder; UC, UD and raw packet QPs return
 * EINVAL when one is given. IB QPs additionally need the QPC extension
 * and nic_qp_scheduling capabilities. Unsupported QP types yield
 * EOPNOTSUPP.
 */
static int _mlx5dv_modify_qp_sched_elem(struct ibv_qp *qp,
					const struct mlx5dv_sched_leaf *requestor,
					const struct mlx5dv_sched_leaf *responder)
{
	struct mlx5_qos_caps *caps = &to_mctx(qp->context)->qos_caps;

	if (qp->qp_type == IBV_QPT_RAW_PACKET) {
		if (responder)
			return EINVAL;
		return modify_raw_qp_sched_elem(qp,
						requestor ? requestor->obj->object_id : 0);
	}

	if (qp->qp_type != IBV_QPT_RC &&
	    qp->qp_type != IBV_QPT_UC &&
	    qp->qp_type != IBV_QPT_UD)
		return EOPNOTSUPP;

	/* Of the IB QP types only RC may carry a responder leaf. */
	if (qp->qp_type != IBV_QPT_RC && responder)
		return EINVAL;

	if (!to_mctx(qp->context)->qpc_extension_cap ||
	    !caps->nic_qp_scheduling)
		return EOPNOTSUPP;

	return modify_ib_qp_sched_elem(qp,
				       requestor ? requestor->obj->object_id : 0,
				       responder ? responder->obj->object_id : 0);
}

/*
 * Public entry point: dispatch to the modify_qp_sched_elem callback of
 * the DV ops bound to @qp's context, or return EOPNOTSUPP when none
 * exists.
 */
int mlx5dv_modify_qp_sched_elem(struct ibv_qp *qp,
				const struct mlx5dv_sched_leaf *requestor,
				const struct mlx5dv_sched_leaf *responder)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(qp->context);

	if (ops && ops->modify_qp_sched_elem)
		return ops->modify_qp_sched_elem(qp, requestor, responder);

	return EOPNOTSUPP;
}

int mlx5_modify_qp_drain_sigerr(struct ibv_qp *qp)
{
	uint64_t mask = MLX5_QPC_OPT_MASK_INIT2INIT_DRAIN_SIGERR;
	uint32_t in[DEVX_ST_SZ_DW(init2init_qp_in)] = {};
	uint32_t out[DEVX_ST_SZ_DW(init2init_qp_out)] = {};
	void *qpc = DEVX_ADDR_OF(init2init_qp_in, in, qpc);
	int ret;

	DEVX_SET(init2init_qp_in, in, opcode, MLX5_CMD_OP_INIT2INIT_QP);
	DEVX_SET(init2init_qp_in, in, qpn, qp->qp_num);
	DEVX_SET(init2init_qp_in, in, opt_param_mask, mask);

	DEVX_SET(qpc, qpc, drain_sigerr, 1);

	ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, sizeof(out));
	return ret ? mlx5_get_cmd_status_err(ret, out) : 0;
}

/*
 * Allocate a block of reserved QPNs: the tracking structure, its
 * bitmap and the RESERVED_QPN general object that backs it.
 *
 * Returns the new block, or NULL with errno set (ENOMEM for allocation
 * failures, otherwise the value from mlx5_get_cmd_status_err()).
 */
static struct reserved_qpn_blk *reserved_qpn_blk_alloc(struct mlx5_context *mctx)
{
	uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	uint32_t in[DEVX_ST_SZ_DW(create_reserved_qpn_in)] = {};
	struct reserved_qpn_blk *new_blk;
	void *cmd_hdr;

	new_blk = calloc(1, sizeof(*new_blk));
	if (!new_blk) {
		errno = ENOMEM;
		return NULL;
	}

	new_blk->bmp = bitmap_alloc0(1 << mctx->hca_cap_2_caps.log_reserved_qpns_per_obj);
	if (!new_blk->bmp) {
		errno = ENOMEM;
		free(new_blk);
		return NULL;
	}

	cmd_hdr = DEVX_ADDR_OF(create_reserved_qpn_in, in, hdr);
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, obj_type, MLX5_OBJ_TYPE_RESERVED_QPN);
	DEVX_SET(general_obj_in_cmd_hdr,
		 cmd_hdr, log_obj_range, mctx->hca_cap_2_caps.log_reserved_qpns_per_obj);

	new_blk->obj = mlx5dv_devx_obj_create(&mctx->ibv_ctx.context,
					      in, sizeof(in), out, sizeof(out));
	if (!new_blk->obj) {
		errno = mlx5_get_cmd_status_err(errno, out);
		free(new_blk->bmp);
		free(new_blk);
		return NULL;
	}

	/* The object id is used as the first QPN of the block. */
	new_blk->next_avail_slot = 0;
	new_blk->first_qpn = new_blk->obj->object_id;

	return new_blk;
}

/*
 * Destroy the DEVX object behind @blk and free the block together with
 * its bitmap. A destroy failure trips an assertion; the memory is
 * released regardless.
 */
static void reserved_qpn_blk_dealloc(struct reserved_qpn_blk *blk)
{
	int err = mlx5dv_devx_obj_destroy(blk->obj);

	if (err)
		assert(false);

	free(blk->bmp);
	free(blk);
}

/*
 * Unlink and deallocate every reserved-QPN block of @mctx while
 * holding the reserved_qpns mutex.
 */
static void reserved_qpn_blks_free(struct mlx5_context *mctx)
{
	struct reserved_qpn_blk *cur, *next;

	pthread_mutex_lock(&mctx->reserved_qpns.mutex);

	list_for_each_safe(&mctx->reserved_qpns.blk_list, cur, next, entry) {
		list_del(&cur->entry);
		reserved_qpn_blk_dealloc(cur);
	}

	pthread_mutex_unlock(&mctx->reserved_qpns.mutex);
}

/**
 * Allocate a reserved QPN, either from the most recently allocated FW
 * object or, when that one is exhausted, from a newly created one.
 * Within an object the search always continues from the last allocated
 * position, so QPNs only move forward and a stale QPN is not handed
 * out again.
 *
 * Returns 0 on success with *qpn set, EOPNOTSUPP when the device does
 * not support reserved QPN objects, or the errno left by a failed
 * block allocation.
 */
static int _mlx5dv_reserved_qpn_alloc(struct ibv_context *ctx, uint32_t *qpn)
{
	struct mlx5_context *mctx = to_mctx(ctx);
	struct reserved_qpn_blk *cur;
	uint32_t blk_capacity;
	int err = 0;

	if (!(mctx->general_obj_types_caps & (1ULL << MLX5_OBJ_TYPE_RESERVED_QPN)))
		return EOPNOTSUPP;

	blk_capacity = 1 << mctx->hca_cap_2_caps.log_reserved_qpns_per_obj;

	pthread_mutex_lock(&mctx->reserved_qpns.mutex);

	cur = list_tail(&mctx->reserved_qpns.blk_list,
			struct reserved_qpn_blk, entry);

	/* No block yet, or the newest one has no unused slot left. */
	if (!cur || cur->next_avail_slot >= blk_capacity) {
		cur = reserved_qpn_blk_alloc(mctx);
		if (!cur) {
			err = errno;
			goto unlock;
		}
		list_add_tail(&mctx->reserved_qpns.blk_list, &cur->entry);
	}

	*qpn = cur->first_qpn + cur->next_avail_slot;
	bitmap_set_bit(cur->bmp, cur->next_avail_slot);
	cur->next_avail_slot++;

unlock:
	pthread_mutex_unlock(&mctx->reserved_qpns.mutex);
	return err;
}

/*
 * Public entry point: dispatch to the reserved_qpn_alloc callback of
 * the DV ops bound to @ctx, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_reserved_qpn_alloc(struct ibv_context *ctx, uint32_t *qpn)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ctx);

	if (ops && ops->reserved_qpn_alloc)
		return ops->reserved_qpn_alloc(ctx, qpn);

	return EOPNOTSUPP;
}

/**
 * Deallocate a reserved QPN. The FW object is destroyed only once all
 * QPNs of that object have been handed out and freed again.
 *
 * Returns 0 on success, or EINVAL (also stored in errno) when @qpn
 * does not belong to any block or is not currently allocated.
 */
static int _mlx5dv_reserved_qpn_dealloc(struct ibv_context *ctx, uint32_t qpn)
{
	struct mlx5_context *mctx = to_mctx(ctx);
	struct reserved_qpn_blk *iter, *next, *owner = NULL;
	uint32_t blk_capacity;
	int err = 0;

	blk_capacity = 1 << mctx->hca_cap_2_caps.log_reserved_qpns_per_obj;

	pthread_mutex_lock(&mctx->reserved_qpns.mutex);

	/* Locate the block whose QPN range contains @qpn. */
	list_for_each_safe(&mctx->reserved_qpns.blk_list,
			   iter, next, entry) {
		if (qpn >= iter->first_qpn &&
		    qpn < iter->first_qpn + blk_capacity) {
			owner = iter;
			break;
		}
	}

	if (!owner || !bitmap_test_bit(owner->bmp, qpn - owner->first_qpn)) {
		errno = EINVAL;
		err = errno;
		goto unlock;
	}

	bitmap_clear_bit(owner->bmp, qpn - owner->first_qpn);

	/* Drop the block once it is both fully consumed and fully freed. */
	if (owner->next_avail_slot >= blk_capacity &&
	    bitmap_empty(owner->bmp, blk_capacity)) {
		list_del(&owner->entry);
		reserved_qpn_blk_dealloc(owner);
	}

unlock:
	pthread_mutex_unlock(&mctx->reserved_qpns.mutex);
	return err;
}

/*
 * Public entry point: dispatch to the reserved_qpn_dealloc callback of
 * the DV ops bound to @ctx, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_reserved_qpn_dealloc(struct ibv_context *ctx, uint32_t qpn)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ctx);

	if (ops && ops->reserved_qpn_dealloc)
		return ops->reserved_qpn_dealloc(ctx, qpn);

	return EOPNOTSUPP;
}

/*
 * Fill the output structures of every object type selected in
 * @obj_type, in a fixed order (QP, CQ, SRQ, RWQ, DM, AH, PD, DEVX).
 * Processing stops at the first getter that fails and its error is
 * returned; 0 is returned when all selected getters succeed.
 */
static int _mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int err;

	if (obj_type & MLX5DV_OBJ_QP) {
		err = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_CQ) {
		err = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_SRQ) {
		err = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_RWQ) {
		err = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_DM) {
		err = mlx5dv_get_dm(obj->dm.in, obj->dm.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_AH) {
		err = mlx5dv_get_av(obj->ah.in, obj->ah.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_PD) {
		err = mlx5dv_get_pd(obj->pd.in, obj->pd.out);
		if (err)
			return err;
	}

	if (obj_type & MLX5DV_OBJ_DEVX) {
		err = mlx5dv_get_devx(obj->devx.in, obj->devx.out);
		if (err)
			return err;
	}

	return 0;
}

/*
 * Return the verbs context of the first object selected in @obj_type
 * (checked in the order QP, CQ, SRQ, RWQ, DM, AH, PD, DEVX), or NULL
 * when none of these bits is set.
 */
static struct ibv_context *
get_context_from_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	struct ibv_context *ctx = NULL;

	if (obj_type & MLX5DV_OBJ_QP)
		ctx = obj->qp.in->context;
	else if (obj_type & MLX5DV_OBJ_CQ)
		ctx = obj->cq.in->context;
	else if (obj_type & MLX5DV_OBJ_SRQ)
		ctx = obj->srq.in->context;
	else if (obj_type & MLX5DV_OBJ_RWQ)
		ctx = obj->rwq.in->context;
	else if (obj_type & MLX5DV_OBJ_DM)
		ctx = obj->dm.in->context;
	else if (obj_type & MLX5DV_OBJ_AH)
		ctx = obj->ah.in->context;
	else if (obj_type & MLX5DV_OBJ_PD)
		ctx = obj->pd.in->context;
	else if (obj_type & MLX5DV_OBJ_DEVX)
		ctx = obj->devx.in->context;

	return ctx;
}

/*
 * Current (MLX5_1.2) mlx5dv_init_obj(): find the context behind the
 * selected objects and forward to its init_obj DV op.
 *
 * Returns EINVAL when no context can be derived from @obj_type and
 * EOPNOTSUPP when the context has no init_obj callback.
 */
LATEST_SYMVER_FUNC(mlx5dv_init_obj, 1_2, "MLX5_1.2",
		   int,
		   struct mlx5dv_obj *obj, uint64_t obj_type)
{
	struct ibv_context *ctx = get_context_from_obj(obj, obj_type);
	struct mlx5_dv_context_ops *ops;

	if (!ctx)
		return EINVAL;

	ops = mlx5_get_dv_ops(ctx);
	if (ops && ops->init_obj)
		return ops->init_obj(obj, obj_type);

	return EOPNOTSUPP;
}

/*
 * Compatibility (MLX5_1.0) mlx5dv_init_obj(): same as the 1.2 version,
 * except that for a CQ the cq_uar output receives the address of the
 * context's cq_uar_reg instead of its value.
 */
COMPAT_SYMVER_FUNC(mlx5dv_init_obj, 1_0, "MLX5_1.0",
		   int,
		   struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int err = __mlx5dv_init_obj_1_2(obj, obj_type);

	if (err)
		return err;

	if (obj_type & MLX5DV_OBJ_CQ) {
		/* ABI version 1.0 returns the void ** in this memory
		 * location
		 */
		obj->cq.out->cq_uar = &(to_mctx(obj->cq.in->context)->cq_uar_reg);
	}

	return 0;
}

/*
 * Build the mmap offset for UAR index @idx and mmap command @command.
 *
 * The command and index are encoded into a page offset, which is then
 * scaled by @page_size. The extended index encoding is used only for
 * MLX5_IB_MMAP_ALLOC_WC with an index of at least
 * (1 << MLX5_IB_MMAP_CMD_SHIFT).
 */
off_t get_uar_mmap_offset(int idx, int page_size, int command)
{
	bool use_extended_index = command == MLX5_IB_MMAP_ALLOC_WC &&
				  idx >= (1 << MLX5_IB_MMAP_CMD_SHIFT);
	off_t pg_off = 0;

	set_command(command, &pg_off);

	if (use_extended_index)
		set_extended_index(idx, &pg_off);
	else
		set_index(idx, &pg_off);

	return pg_off * page_size;
}

/*
 * Map a UAR type to its mmap command: the NC-pages command for
 * MLX5_UAR_TYPE_NC, the regular-pages command for anything else.
 */
static off_t uar_type_to_cmd(int uar_type)
{
	if (uar_type == MLX5_UAR_TYPE_NC)
		return MLX5_MMAP_GET_NC_PAGES_CMD;

	return MLX5_MMAP_GET_REGULAR_PAGES_CMD;
}

/*
 * Map one UAR page of size @page_size from @cmd_fd into @uar.
 *
 * For MLX5_UAR_TYPE_NC the NC-pages command is tried first; if that
 * mapping fails the function falls through to the regular path. On a
 * successful mapping uar->type records which kind was obtained.
 *
 * Returns uar->reg, which is MAP_FAILED when the last mmap() failed.
 */
void *mlx5_mmap(struct mlx5_uar_info *uar, int index, int cmd_fd, int page_size,
		int uar_type)
{
	int fallback_cmd;
	off_t offset;

	if (uar_type == MLX5_UAR_TYPE_NC) {
		offset = get_uar_mmap_offset(index, page_size,
					     MLX5_MMAP_GET_NC_PAGES_CMD);
		uar->reg = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				cmd_fd, offset);
		if (uar->reg != MAP_FAILED) {
			uar->type = MLX5_UAR_TYPE_NC;
			return uar->reg;
		}
	}

	/* Backward compatibility for legacy kernels that don't support
	 * MLX5_MMAP_GET_NC_PAGES_CMD mmap command.
	 */
	if (uar_type == MLX5_UAR_TYPE_REGULAR_DYN)
		fallback_cmd = MLX5_IB_MMAP_ALLOC_WC;
	else
		fallback_cmd = MLX5_MMAP_GET_REGULAR_PAGES_CMD;

	offset = get_uar_mmap_offset(index, page_size, fallback_cmd);
	uar->reg = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
			cmd_fd, offset);
	if (uar->reg != MAP_FAILED)
		uar->type = MLX5_UAR_TYPE_REGULAR;

	return uar->reg;
}

/*
 * Set a context attribute of the given @type from the value that
 * @attr points to.
 *
 * MLX5DV_CTX_ATTR_BUF_ALLOCATORS copies a struct mlx5dv_ctx_allocators
 * into the context.
 *
 * Returns 0 on success, EINVAL when @attr is NULL for a type that
 * reads it, and ENOTSUP for an unknown @type.
 */
static int _mlx5dv_set_context_attr(struct ibv_context *ibv_ctx,
				    enum mlx5dv_set_ctx_attr_type type,
				    void *attr)
{
	struct mlx5_context *ctx = to_mctx(ibv_ctx);

	switch (type) {
	case MLX5DV_CTX_ATTR_BUF_ALLOCATORS:
		/* The allocators are copied by value, so a NULL pointer
		 * cannot be honoured.
		 */
		if (!attr)
			return EINVAL;
		ctx->extern_alloc = *((struct mlx5dv_ctx_allocators *)attr);
		break;
	default:
		return ENOTSUP;
	}

	return 0;
}

/*
 * Public entry point: dispatch to the set_context_attr callback of the
 * DV ops bound to @ibv_ctx, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_set_context_attr(struct ibv_context *ibv_ctx,
			    enum mlx5dv_set_ctx_attr_type type, void *attr)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ibv_ctx);

	if (ops && ops->set_context_attr)
		return ops->set_context_attr(ibv_ctx, type, attr);

	return EOPNOTSUPP;
}

/*
 * Copy a snapshot of the context's clock info page into @clock_info.
 *
 * Returns EOPNOTSUPP when the device is not an mlx5 device, EINVAL when
 * the context has no clock info page, EBUSY when the page keeps
 * reporting MLX5_IB_CLOCK_INFO_KERNEL_UPDATING for all retries of one
 * pass, and 0 once a snapshot was copied with an unchanged signature.
 */
static int _mlx5dv_get_clock_info(struct ibv_context *ctx_in,
				  struct mlx5dv_clock_info *clock_info)
{
	struct mlx5_context *ctx = to_mctx(ctx_in);
	const struct mlx5_ib_clock_info *ci;
	uint32_t retry, tmp_sig;
	atomic_uint32_t *sig;

	if (!is_mlx5_dev(ctx_in->device))
		return EOPNOTSUPP;

	ci = ctx->clock_info_page;

	if (!ci)
		return EINVAL;

	/* The signature word is read with atomic loads below. */
	sig = (atomic_uint32_t *)&ci->sign;

	do {
		/* The retry budget is reset for every outer pass. */
		retry = 10;
repeat:
		tmp_sig = atomic_load(sig);
		/* Re-read while the updating flag is set; give up with
		 * EBUSY once the retry counter reaches zero.
		 */
		if (unlikely(tmp_sig &
			     MLX5_IB_CLOCK_INFO_KERNEL_UPDATING)) {
			if (--retry)
				goto repeat;
			return EBUSY;
		}
		clock_info->nsec   = ci->nsec;
		clock_info->last_cycles = ci->cycles;
		clock_info->frac   = ci->frac;
		clock_info->mult   = ci->mult;
		clock_info->shift  = ci->shift;
		clock_info->mask   = ci->mask;
		/* Redo the copy if the signature changed while copying. */
	} while (unlikely(tmp_sig != atomic_load(sig)));

	return 0;
}

/*
 * Public entry point: dispatch to the get_clock_info callback of the
 * DV ops bound to @ctx_in, or return EOPNOTSUPP when none exists.
 */
int mlx5dv_get_clock_info(struct ibv_context *ctx_in,
			  struct mlx5dv_clock_info *clock_info)
{
	struct mlx5_dv_context_ops *ops = mlx5_get_dv_ops(ctx_in);

	if (ops && ops->get_clock_info)
		return ops->get_clock_info(ctx_in, clock_info);

	return EOPNOTSUPP;
}

static struct mlx5_dv_context_ops mlx5_dv_ctx_ops = {
	.query_device = _mlx5dv_query_device,

	.query_qp_lag_port = _mlx5dv_query_qp_lag_port,
	.modify_qp_lag_port = _mlx5dv_modify_qp_lag_port,

	.modify_qp_udp_sport = _mlx5dv_modify_qp_udp_sport,

	.sched_node_create = _mlx5dv_sched_node_create,
	.sched_leaf_create = _mlx5dv_sched_leaf_create,
	.sched_node_modify = _mlx5dv_sched_node_modify,
	.sched_leaf_modify = _mlx5dv_sched_leaf_modify,
	.sched_node_destroy = _mlx5dv_sched_node_destroy,
	.sched_leaf_destroy = _mlx5dv_sched_leaf_destroy,
	.modify_qp_sched_elem = _mlx5dv_modify_qp_sched_elem,

	.reserved_qpn_alloc = _mlx5dv_reserved_qpn_alloc,
	.reserved_qpn_dealloc = _mlx5dv_reserved_qpn_dealloc,

	.set_context_attr = _mlx5dv_set_context_attr,
	.get_clock_info = _mlx5dv_get_clock_info,
	.init_obj = _mlx5dv_init_obj,
};

/*
 * Derive the context's UAR size and UARs-per-page from the kernel
 * response. When the response carries neither value (old kernel) the
 * device page size and one UAR per page are used instead.
 */
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_ib_alloc_ucontext_resp *resp)
{
	bool old_kernel = !resp->log_uar_size && !resp->num_uars_per_page;

	if (old_kernel) {
		context->num_uars_per_page = 1;
		context->uar_size = mdev->page_size;
	} else {
		context->num_uars_per_page = resp->num_uars_per_page;
		context->uar_size = 1 << resp->log_uar_size;
	}
}

/* Report whether @device passes the is_mlx5_dev() check. */
bool mlx5dv_is_supported(struct ibv_device *device)
{
	bool is_mlx5 = is_mlx5_dev(device);

	return is_mlx5;
}

/*
 * Open @device via verbs_open_device(), passing @attr through as the
 * driver private data.
 *
 * Returns NULL with errno set to EOPNOTSUPP when @device is not an mlx5
 * device; otherwise returns the result of verbs_open_device().
 */
struct ibv_context *
mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr)
{
	if (is_mlx5_dev(device))
		return verbs_open_device(device, attr);

	errno = EOPNOTSUPP;
	return NULL;
}

/*
 * Compute the total and low-latency uuar counts for @mdev and store them
 * in @tot_uuars and @low_lat_uuars.
 *
 * Returns 0 on success. Returns -1 with errno set when either helper
 * reports a negative value (errno is the negated value), or with
 * errno = ENOMEM when the low-latency count exceeds total - 1.
 */
static int get_uar_info(struct mlx5_device *mdev,
			int *tot_uuars, int *low_lat_uuars)
{
	int total;
	int low_lat;

	total = get_total_uuars(mdev->page_size);
	*tot_uuars = total;
	if (total < 0) {
		errno = -total;
		return -1;
	}

	low_lat = get_num_low_lat_uuars(total);
	*low_lat_uuars = low_lat;
	if (low_lat < 0) {
		errno = -low_lat;
		return -1;
	}

	if (total - 1 < low_lat) {
		errno = ENOMEM;
		return -1;
	}

	return 0;
}

/*
 * Undo mlx5_init_context(): close the debug file, uninitialize the verbs
 * context and release the context memory.
 */
static void mlx5_uninit_context(struct mlx5_context *context)
{
	struct verbs_context *vctx = &context->ibv_ctx;

	mlx5_close_debug_file(context->dbg_fp);
	verbs_uninit_context(vctx);

	free(context);
}

/*
 * Allocate an mlx5 context for @ibdev / @cmd_fd and perform the setup that
 * does not depend on the kernel response: debug file and mask, freeze on
 * error setting, hostname, single-threaded detection and uuar counts.
 *
 * Returns the new context, or NULL on failure. When get_uar_info() fails
 * the partially built context is released and errno is left as set by
 * get_uar_info().
 */
static struct mlx5_context *mlx5_init_context(struct ibv_device *ibdev,
						int cmd_fd)
{
	struct mlx5_device *mdev = to_mdev(ibdev);
	struct mlx5_context *context;
	int low_lat_uuars;
	int tot_uuars;
	int ret;

	context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx,
					       RDMA_DRIVER_MLX5);
	if (!context)
		return NULL;

	mlx5_open_debug_file(&context->dbg_fp);
	mlx5_set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");
	/*
	 * POSIX leaves it unspecified whether a truncated host name is
	 * NUL-terminated, so terminate the buffer explicitly.
	 */
	context->hostname[sizeof(context->hostname) - 1] = '\0';

	mlx5_single_threaded = single_threaded_app();

	ret = get_uar_info(mdev, &tot_uuars, &low_lat_uuars);
	if (ret) {
		mlx5_uninit_context(context);
		return NULL;
	}
	context->tot_uuars = tot_uuars;
	context->low_lat_uuars = low_lat_uuars;

	return context;
}

/*
 * Populate @context from the kernel alloc-ucontext response @resp: copy
 * capabilities and flags, initialize locks, tables and lists, map the
 * legacy UAR pages and set up the bf registers (when the kernel reports
 * tot_bfregs), map the clock pages, install the verbs and DV ops and cache
 * per-port attributes.
 *
 * @is_import: true when the context is being imported; a response that
 *             carries tot_bfregs is rejected in that case.
 *
 * Returns 0 on success, EINVAL (with errno = EINVAL) for an imported
 * context whose response carries tot_bfregs, and -1 on any other failure.
 * On the -1 paths everything mapped or allocated by this function (bf
 * array, UAR pages, clock pages) is released again.
 */
static int mlx5_set_context(struct mlx5_context *context,
			    struct mlx5_ib_alloc_ucontext_resp *resp,
			    bool is_import)
{
	struct verbs_context *v_ctx = &context->ibv_ctx;
	struct ibv_port_attr port_attr = {};
	int cmd_fd = v_ctx->context.cmd_fd;
	struct mlx5_device *mdev = to_mdev(v_ctx->context.device);
	struct ibv_device *ibdev = v_ctx->context.device;
	int page_size = mdev->page_size;
	int num_sys_page_map;
	int gross_uuars;
	int bfi;
	int i, k, j;

	/* Plain capability values copied from the response */
	context->max_num_qps = resp->qp_tab_size;
	context->bf_reg_size = resp->bf_reg_size;
	context->cache_line_size = resp->cache_line_size;
	context->max_sq_desc_sz = resp->max_sq_desc_sz;
	context->max_rq_desc_sz = resp->max_rq_desc_sz;
	context->max_send_wqebb	= resp->max_send_wqebb;
	context->num_ports = resp->num_ports;
	context->max_recv_wr = resp->max_recv_wr;
	context->max_srq_recv_wr = resp->max_srq_recv_wr;
	context->num_dyn_bfregs = resp->num_dyn_bfregs;

	/* Optional features advertised through comp_mask */
	if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE)
		context->flags |= MLX5_CTX_FLAGS_ECE_SUPPORTED;

	if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_SQD2RTS)
		context->flags |= MLX5_CTX_FLAGS_SQD2RTS_SUPPORTED;

	if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_REAL_TIME_TS)
		context->flags |= MLX5_CTX_FLAGS_REAL_TIME_TS_SUPPORTED;

	if (resp->comp_mask &
	    MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_MKEY_UPDATE_TAG)
		context->flags |= MLX5_CTX_FLAGS_MKEY_UPDATE_TAG_SUPPORTED;

	if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY) {
		context->dump_fill_mkey = resp->dump_fill_mkey;
		/* Have the BE value ready to be used in data path */
		context->dump_fill_mkey_be = htobe32(resp->dump_fill_mkey);
	} else {
		/* kernel driver will never return MLX5_INVALID_LKEY for
		 * dump_fill_mkey
		 */
		context->dump_fill_mkey = MLX5_INVALID_LKEY;
		context->dump_fill_mkey_be = htobe32(MLX5_INVALID_LKEY);
	}

	context->cqe_version = resp->cqe_version;
	adjust_uar_info(mdev, context, resp);

	context->cmds_supp_uhw = resp->cmds_supp_uhw;
	context->vendor_cap_flags = 0;
	list_head_init(&context->dyn_uar_bf_list);
	list_head_init(&context->dyn_uar_db_list);
	list_head_init(&context->dyn_uar_qp_shared_list);
	list_head_init(&context->dyn_uar_qp_dedicated_list);

	/* A zero eth_min_inline falls back to the L2 inline header size */
	if (resp->eth_min_inline)
		context->eth_min_inline_size = (resp->eth_min_inline == MLX5_USER_INLINE_MODE_NONE) ?
						0 : MLX5_ETH_L2_INLINE_HEADER_SIZE;
	else
		context->eth_min_inline_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	pthread_mutex_init(&context->mkey_table_mutex, NULL);
	pthread_mutex_init(&context->dyn_bfregs_mutex, NULL);
	pthread_mutex_init(&context->crypto_login_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	for (i = 0; i < MLX5_MKEY_TABLE_SIZE; ++i)
		context->mkey_table[i].refcnt = 0;

	list_head_init(&context->dbr_available_pages);
	cl_qmap_init(&context->dbr_map);

	pthread_mutex_init(&context->dbr_map_mutex, NULL);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();

	if (resp->tot_bfregs) {
		if (is_import) {
			errno = EINVAL;
			return EINVAL;
		}
		context->tot_uuars = resp->tot_bfregs;
		gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
		context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
		if (!context->bfs) {
			errno = ENOMEM;
			goto err_free;
		}
		context->flags |= MLX5_CTX_FLAGS_NO_KERN_DYN_UAR;
	} else {
		/* No bf registers reported: skip the UAR mapping entirely */
		context->qp_max_dedicated_uuars = context->low_lat_uuars;
		context->qp_max_shared_uuars = context->tot_uuars - context->low_lat_uuars;
		goto bf_done;
	}

	context->max_num_legacy_dyn_uar_sys_page = context->num_dyn_bfregs /
			(context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		if (mlx5_mmap(&context->uar[i], i, cmd_fd, page_size,
			      context->shut_up_bf ? MLX5_UAR_TYPE_NC :
			      MLX5_UAR_TYPE_REGULAR) == MAP_FAILED) {
			/* keep the cleanup loop from unmapping this slot */
			context->uar[i].reg = NULL;
			goto err_free_bf;
		}
	}

	/* One bf entry per (system page, UAR within page, bfreg within UAR) */
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i].reg + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock, context->bfs[bfi].need_lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
				context->bfs[bfi].uar_mmap_offset =
					get_uar_mmap_offset(i, page_size,
							uar_type_to_cmd(context->uar[i].type));
			}
		}
	}

bf_done:

	context->hca_core_clock = NULL;
	if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp->hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, &v_ctx->context);
	}

	context->clock_info_page = NULL;
	if ((resp->clock_info_versions & (1 << MLX5_IB_CLOCK_INFO_V1)))
		mlx5_map_clock_info(mdev, &v_ctx->context);

	context->flow_action_flags = resp->flow_action_flags;

	mlx5_read_env(ibdev, context);

	mlx5_spinlock_init(&context->hugetlb_lock, !mlx5_single_threaded);
	list_head_init(&context->hugetlb_list);

	verbs_set_ops(v_ctx, &mlx5_ctx_common_ops);
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1) {
			verbs_set_ops(v_ctx, &mlx5_ctx_cqev1_ops);
		} else {
			/*
			 * Unknown CQE version: the clock pages and the bf
			 * array may already exist at this point, so unwind
			 * them as well instead of only the UAR pages.
			 */
			errno = EOPNOTSUPP;
			goto err_unmap_clock;
		}
	}
	context->dv_ctx_ops = &mlx5_dv_ctx_ops;

	mlx5_query_device_ctx(context);

	/* Cache link layer and flags of each port that can be queried */
	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(&v_ctx->context, j + 1, &port_attr)) {
			context->cached_link_layer[j] = port_attr.link_layer;
			context->cached_port_flags[j] = port_attr.flags;
		}
	}

	mlx5_set_singleton_nc_uar(&v_ctx->context);
	context->cq_uar_reg = context->nc_uar ? context->nc_uar->uar : context->uar[0].reg;

	pthread_mutex_init(&context->reserved_qpns.mutex, NULL);
	list_head_init(&context->reserved_qpns.blk_list);

	return 0;

err_unmap_clock:
	/* Same unmap expressions as used by mlx5_free_context() */
	if (context->hca_core_clock) {
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
		context->hca_core_clock = NULL;
	}
	if (context->clock_info_page) {
		munmap((void *)context->clock_info_page, page_size);
		context->clock_info_page = NULL;
	}

err_free_bf:
	free(context->bfs);
	context->bfs = NULL;

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i].reg) {
			munmap(context->uar[i].reg, page_size);
			context->uar[i].reg = NULL;
		}
	}

	return -1;
}

/*
 * alloc_context device op: build a new mlx5 context on @cmd_fd.
 *
 * @private_data: optional struct mlx5dv_context_attr; NULL is accepted.
 *
 * DEVX is always requested from the kernel. When the caller did not pass
 * any flags, a failed request is retried once without DEVX; when flags
 * were passed, the failure is final.
 *
 * Returns the verbs context, or NULL on failure (errno = EINVAL for
 * unknown comp_mask or flags bits).
 */
static struct verbs_context *mlx5_alloc_context(struct ibv_device *ibdev,
						int cmd_fd,
						void *private_data)
{
	struct mlx5dv_context_attr *ctx_attr = private_data;
	struct mlx5_alloc_ucontext_resp resp = {};
	struct mlx5_alloc_ucontext req = {};
	struct ibv_fd_arr *fds = NULL;
	struct mlx5_context *context;
	bool always_devx;

	context = mlx5_init_context(ibdev, cmd_fd);
	if (!context)
		return NULL;

	if (ctx_attr && ctx_attr->comp_mask) {
		if (!check_comp_mask(ctx_attr->comp_mask,
				     MLX5DV_CONTEXT_ATTR_MASK_FD_ARRAY)) {
			errno = EINVAL;
			goto err;
		}
		fds = ctx_attr->fds;
	}

	if (ctx_attr && ctx_attr->flags) {
		if (!check_comp_mask(ctx_attr->flags,
				     MLX5DV_CONTEXT_FLAGS_DEVX)) {
			errno = EINVAL;
			goto err;
		}
		always_devx = false;
	} else {
		/* no explicit request: DEVX is tried, but is optional */
		always_devx = true;
	}

	req.total_num_bfregs = context->tot_uuars;
	req.num_low_latency_bfregs = context->low_lat_uuars;
	req.max_cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= (MLX5_LIB_CAP_4K_UAR | MLX5_LIB_CAP_DYN_UAR);
	req.flags = MLX5_IB_ALLOC_UCTX_DEVX;

	while (mlx5_cmd_get_context(context, &req, sizeof(req), fds, &resp,
				    sizeof(resp))) {
		if (!always_devx)
			goto err;

		/* second and last attempt, this time without DEVX */
		req.flags &= ~MLX5_IB_ALLOC_UCTX_DEVX;
		always_devx = false;
		memset(&resp, 0, sizeof(resp));
	}

	if (mlx5_set_context(context, &resp.drv_payload, false))
		goto err;

	return &context->ibv_ctx;

err:
	mlx5_uninit_context(context);
	return NULL;
}

/*
 * import_context device op: build an mlx5 context on @cmd_fd by querying
 * the existing kernel context and feeding the response to
 * mlx5_set_context() in import mode.
 *
 * Returns the verbs context, or NULL when initialization, the query or
 * the context setup fails.
 */
static struct verbs_context *mlx5_import_context(struct ibv_device *ibdev,
						int cmd_fd)

{
	struct mlx5_ib_alloc_ucontext_resp resp = {};
	DECLARE_COMMAND_BUFFER_LINK(driver_attr, UVERBS_OBJECT_DEVICE,
				    UVERBS_METHOD_QUERY_CONTEXT, 1,
				    NULL);
	struct mlx5_context *mctx;

	mctx = mlx5_init_context(ibdev, cmd_fd);
	if (!mctx)
		return NULL;

	fill_attr_out_ptr(driver_attr, MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, &resp);

	/* the setup step only runs when the query succeeded */
	if (ibv_cmd_query_context(&mctx->ibv_ctx.context, driver_attr) ||
	    mlx5_set_context(mctx, &resp, true)) {
		mlx5_uninit_context(mctx);
		return NULL;
	}

	return &mctx->ibv_ctx;
}

/*
 * Tear down a fully set up mlx5 context: release the bf array, unmap every
 * mapped UAR page and the clock pages, close the debug file, clean the
 * dynamic UARs and reserved QPN blocks, then uninitialize and free the
 * context itself.
 */
static void mlx5_free_context(struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int idx;

	free(context->bfs);

	for (idx = 0; idx < MLX5_MAX_UARS; idx++) {
		if (!context->uar[idx].reg)
			continue;
		munmap(context->uar[idx].reg, page_size);
	}

	if (context->hca_core_clock) {
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	}

	if (context->clock_info_page) {
		munmap((void *)context->clock_info_page, page_size);
	}

	mlx5_close_debug_file(context->dbg_fp);
	clean_dyn_uars(ibctx);
	reserved_qpn_blks_free(context);

	verbs_uninit_context(&context->ibv_ctx);
	free(context);
}

/* uninit_device op: release the mlx5_device that embeds @verbs_device. */
static void mlx5_uninit_device(struct verbs_device *verbs_device)
{
	free(to_mdev(&verbs_device->device));
}

/*
 * alloc_device op: allocate a zeroed mlx5_device, record the system page
 * size and the ABI version from @sysfs_dev, and pass the DV ops table to
 * mlx5_set_dv_ctx_ops().
 *
 * Returns the embedded verbs_device, or NULL when allocation fails.
 */
static struct verbs_device *mlx5_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
{
	struct mlx5_device *mdev = calloc(1, sizeof(*mdev));

	if (!mdev)
		return NULL;

	mdev->driver_abi_ver = sysfs_dev->abi_ver;
	mdev->page_size = sysconf(_SC_PAGESIZE);

	mlx5_set_dv_ctx_ops(&mlx5_dv_ctx_ops);

	return &mdev->verbs_dev;
}

/*
 * Device ops for the mlx5 provider. is_mlx5_dev() compares a device's ops
 * pointer against this table to recognize mlx5 devices.
 */
static const struct verbs_device_ops mlx5_dev_ops = {
	.name = "mlx5",

	/* matching */
	.match_table = mlx5_hca_table,
	.match_min_abi_version = MLX5_UVERBS_MIN_ABI_VERSION,
	.match_max_abi_version = MLX5_UVERBS_MAX_ABI_VERSION,

	/* device lifetime */
	.alloc_device = mlx5_device_alloc,
	.uninit_device = mlx5_uninit_device,

	/* context creation */
	.alloc_context = mlx5_alloc_context,
	.import_context = mlx5_import_context,
};

static bool is_mlx5_dev(struct ibv_device *device)
{
	struct verbs_device *verbs_device = verbs_get_device(device);

	return verbs_device->ops == &mlx5_dev_ops;
}

/*
 * Return the DV ops table attached to @ibctx: taken from the mlx5 context
 * for mlx5 devices, from the mlx5 vfio context for mlx5 vfio devices, and
 * NULL for any other device.
 */
struct mlx5_dv_context_ops *mlx5_get_dv_ops(struct ibv_context *ibctx)
{
	struct ibv_device *device = ibctx->device;

	if (is_mlx5_dev(device))
		return to_mctx(ibctx)->dv_ctx_ops;

	if (is_mlx5_vfio_dev(device))
		return to_mvfio_ctx(ibctx)->dv_ctx_ops;

	return NULL;
}
/* Pass the mlx5 device ops table to PROVIDER_DRIVER under the name mlx5. */
PROVIDER_DRIVER(mlx5, mlx5_dev_ops);
